goldengoose-divsweep_goose_…/checkpoint-50/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11160714285714286,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1786.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 541.859375,
      "completions/mean_terminated_length": 541.859375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.002232142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.971773147583008,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 112399.0,
      "reward": 0.25312498211860657,
      "reward_std": 0.35192999243736267,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1913.0,
      "completions/max_terminated_length": 1913.0,
      "completions/mean_length": 460.625,
      "completions/mean_terminated_length": 460.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.004464285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.931671142578125,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 239831.0,
      "reward": 0.33671873807907104,
      "reward_std": 0.35391804575920105,
      "rewards/format_reward/mean": 0.3984375,
      "rewards/format_reward/std": 0.20275264978408813,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1663.0,
      "completions/max_terminated_length": 1663.0,
      "completions/mean_length": 578.5625,
      "completions/mean_terminated_length": 578.5625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.006696428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.283196210861206,
      "learning_rate": 4e-07,
      "loss": 0.0,
      "num_tokens": 357427.0,
      "reward": 0.28125,
      "reward_std": 0.3595561385154724,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.279951810836792,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1725.0,
      "completions/mean_length": 508.953125,
      "completions/mean_terminated_length": 484.5238342285156,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.008928571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.620399475097656,
      "learning_rate": 6e-07,
      "loss": 0.0,
      "num_tokens": 498696.0,
      "reward": 0.38203126192092896,
      "reward_std": 0.4592018127441406,
      "rewards/format_reward/mean": 0.3828125,
      "rewards/format_reward/std": 0.2634054720401764,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1176.0,
      "completions/max_terminated_length": 1176.0,
      "completions/mean_length": 443.328125,
      "completions/mean_terminated_length": 443.328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.011160714285714286,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 23.866165161132812,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 606045.0,
      "reward": 0.23671872913837433,
      "reward_std": 0.23968853056430817,
      "rewards/format_reward/mean": 0.3359375,
      "rewards/format_reward/std": 0.2824873626232147,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1733.0,
      "completions/mean_length": 569.5,
      "completions/mean_terminated_length": 521.8064575195312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.013392857142857142,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 4.394739627838135,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 764053.0,
      "reward": 0.3164062201976776,
      "reward_std": 0.24395309388637543,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.2302463799715042,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 531.71875,
      "completions/mean_terminated_length": 507.65081787109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.567261695861816,
      "learning_rate": 9.99726628670463e-07,
      "loss": -0.0,
      "num_tokens": 895587.0,
      "reward": 0.2867187261581421,
      "reward_std": 0.34152650833129883,
      "rewards/format_reward/mean": 0.3671875,
      "rewards/format_reward/std": 0.22257846593856812,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1868.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 575.71875,
      "completions/mean_terminated_length": 575.71875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.017857142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.7189435958862305,
      "learning_rate": 9.989068136093872e-07,
      "loss": 0.0,
      "num_tokens": 1015801.0,
      "reward": 0.26093748211860657,
      "reward_std": 0.2545996308326721,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.20351573824882507,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 584.140625,
      "completions/mean_terminated_length": 560.90478515625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.020089285714285716,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 6.5300421714782715,
      "learning_rate": 9.975414512725056e-07,
      "loss": -0.0,
      "num_tokens": 1153538.0,
      "reward": 0.3515625,
      "reward_std": 0.40456390380859375,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.24346621334552765,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1695.0,
      "completions/mean_length": 586.046875,
      "completions/mean_terminated_length": 562.84130859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.022321428571428572,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 4.844789028167725,
      "learning_rate": 9.956320346634875e-07,
      "loss": -0.0,
      "num_tokens": 1279109.0,
      "reward": 0.31874996423721313,
      "reward_std": 0.30643922090530396,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 403.140625,
      "completions/mean_terminated_length": 350.08062744140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.024553571428571428,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 25.20620346069336,
      "learning_rate": 9.931806517013612e-07,
      "loss": -0.0,
      "num_tokens": 1407374.0,
      "reward": 0.35468748211860657,
      "reward_std": 0.24326452612876892,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.2221602201461792,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1665.0,
      "completions/mean_length": 540.59375,
      "completions/mean_terminated_length": 491.96771240234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.026785714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.811991691589355,
      "learning_rate": 9.901899829374047e-07,
      "loss": -0.0,
      "num_tokens": 1536228.0,
      "reward": 0.2890624701976776,
      "reward_std": 0.38433414697647095,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.24346621334552765,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1231.0,
      "completions/mean_length": 437.3125,
      "completions/mean_terminated_length": 385.3548278808594,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.029017857142857144,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 9.110626220703125,
      "learning_rate": 9.866632986240029e-07,
      "loss": -0.0,
      "num_tokens": 1661912.0,
      "reward": 0.38593748211860657,
      "reward_std": 0.3386583626270294,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.18298126757144928,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1989.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 396.1875,
      "completions/mean_terminated_length": 396.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 22.122051239013672,
      "learning_rate": 9.826044551386742e-07,
      "loss": 0.0,
      "num_tokens": 1785052.0,
      "reward": 0.5171874761581421,
      "reward_std": 0.37966397404670715,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.29504841566085815,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1319.0,
      "completions/mean_length": 398.1875,
      "completions/mean_terminated_length": 344.9677429199219,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.033482142857142856,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 8.21301555633545,
      "learning_rate": 9.780178907671788e-07,
      "loss": -0.0,
      "num_tokens": 1906464.0,
      "reward": 0.23906248807907104,
      "reward_std": 0.214445561170578,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1985812783241272,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1598.0,
      "completions/mean_length": 539.109375,
      "completions/mean_terminated_length": 411.2372741699219,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.03571428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.956470489501953,
      "learning_rate": 9.729086208503173e-07,
      "loss": -0.0,
      "num_tokens": 2039351.0,
      "reward": 0.4976562261581421,
      "reward_std": 0.4534168541431427,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.26899561285972595,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1142.0,
      "completions/mean_length": 333.8125,
      "completions/mean_terminated_length": 306.6031799316406,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03794642857142857,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 19.72380256652832,
      "learning_rate": 9.672822322997304e-07,
      "loss": -0.0,
      "num_tokens": 2139947.0,
      "reward": 0.3046875,
      "reward_std": 0.3565414547920227,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.2916666865348816,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1142.0,
      "completions/mean_length": 248.0625,
      "completions/mean_terminated_length": 219.49208068847656,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.04017857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 27.903902053833008,
      "learning_rate": 9.611448774886923e-07,
      "loss": 0.0,
      "num_tokens": 2281071.0,
      "reward": 0.18984374403953552,
      "reward_std": 0.21500104665756226,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.18881812691688538,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1478.0,
      "completions/mean_length": 237.8125,
      "completions/mean_terminated_length": 209.07937622070312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04241071428571429,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 23.812612533569336,
      "learning_rate": 9.545032675245813e-07,
      "loss": -0.0,
      "num_tokens": 2410507.0,
      "reward": 0.3968749940395355,
      "reward_std": 0.37050747871398926,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.21593283116817474,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 832.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 102.78125,
      "completions/mean_terminated_length": 102.78125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.044642857142857144,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 17.30659294128418,
      "learning_rate": 9.473646649103817e-07,
      "loss": -0.0,
      "num_tokens": 2492773.0,
      "reward": 0.5757812261581421,
      "reward_std": 0.4401569366455078,
      "rewards/format_reward/mean": 0.6015625,
      "rewards/format_reward/std": 0.28423789143562317,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 685.0,
      "completions/mean_length": 71.796875,
      "completions/mean_terminated_length": 40.42857360839844,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.046875,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.079203605651855,
      "learning_rate": 9.397368756032444e-07,
      "loss": 0.0,
      "num_tokens": 2624568.0,
      "reward": 0.10312500596046448,
      "reward_std": 0.10811922699213028,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.22712838649749756,
      "rewards/mcq_exact_match_reward/mean": 0.046875,
      "rewards/mcq_exact_match_reward/std": 0.21304203569889069,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1416.0,
      "completions/max_terminated_length": 1416.0,
      "completions/mean_length": 105.59375,
      "completions/mean_terminated_length": 105.59375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.049107142857142856,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.665273666381836,
      "learning_rate": 9.316282404787869e-07,
      "loss": -0.0,
      "num_tokens": 2746590.0,
      "reward": 0.3062500059604645,
      "reward_std": 0.26257947087287903,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.18898223340511322,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 111.84375,
      "completions/mean_terminated_length": 81.11111450195312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05133928571428571,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 17.90364646911621,
      "learning_rate": 9.230476262104676e-07,
      "loss": -0.0,
      "num_tokens": 2852316.0,
      "reward": 0.26093751192092896,
      "reward_std": 0.11917313188314438,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.2221602201461792,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 721.0,
      "completions/max_terminated_length": 721.0,
      "completions/mean_length": 59.4375,
      "completions/mean_terminated_length": 59.4375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05357142857142857,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 23.127439498901367,
      "learning_rate": 9.1400441557401e-07,
      "loss": 0.0,
      "num_tokens": 2948880.0,
      "reward": 0.4039062261581421,
      "reward_std": 0.3720070719718933,
      "rewards/format_reward/mean": 0.6015625,
      "rewards/format_reward/std": 0.28423789143562317,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 473.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 38.484375,
      "completions/mean_terminated_length": 38.484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05580357142857143,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 23.131183624267578,
      "learning_rate": 9.045084971874737e-07,
      "loss": -0.0,
      "num_tokens": 3042983.0,
      "reward": 0.49609375,
      "reward_std": 0.39612793922424316,
      "rewards/format_reward/mean": 0.5859375,
      "rewards/format_reward/std": 0.244862899184227,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 396.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 14.578125,
      "completions/mean_terminated_length": 14.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05803571428571429,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 21.188722610473633,
      "learning_rate": 8.945702546981968e-07,
      "loss": -0.0,
      "num_tokens": 3153444.0,
      "reward": 0.26874998211860657,
      "reward_std": 0.29173365235328674,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.08908708393573761,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 8.234375,
      "completions/mean_terminated_length": 8.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.060267857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 31.691007614135742,
      "learning_rate": 8.842005554284295e-07,
      "loss": 0.0,
      "num_tokens": 3245059.0,
      "reward": 0.3148437738418579,
      "reward_std": 0.4270421266555786,
      "rewards/format_reward/mean": 0.6484375,
      "rewards/format_reward/std": 0.2302463799715042,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 910.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 23.265625,
      "completions/mean_terminated_length": 23.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 24.359390258789062,
      "learning_rate": 8.734107384920769e-07,
      "loss": 0.0,
      "num_tokens": 3361140.0,
      "reward": 0.40156251192092896,
      "reward_std": 0.28653684258461,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.18298126757144928,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 9.609375,
      "completions/mean_terminated_length": 9.609375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06473214285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 17.99944305419922,
      "learning_rate": 8.622126023955445e-07,
      "loss": 0.0,
      "num_tokens": 3490331.0,
      "reward": 0.20546874403953552,
      "reward_std": 0.17656923830509186,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 6.625,
      "completions/mean_terminated_length": 6.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06696428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.938568115234375,
      "learning_rate": 8.506183921362442e-07,
      "loss": 0.0,
      "num_tokens": 3605843.0,
      "reward": 0.2874999940395355,
      "reward_std": 0.11888101696968079,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.12198751419782639,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 33.0,
      "completions/max_terminated_length": 33.0,
      "completions/mean_length": 6.421875,
      "completions/mean_terminated_length": 6.421875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06919642857142858,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 20.33233070373535,
      "learning_rate": 8.386407858128706e-07,
      "loss": -0.0,
      "num_tokens": 3710350.0,
      "reward": 0.44062498211860657,
      "reward_std": 0.210945725440979,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 6.109375,
      "completions/mean_terminated_length": 6.109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 26.781448364257812,
      "learning_rate": 8.262928807620843e-07,
      "loss": 0.0,
      "num_tokens": 3807701.0,
      "reward": 0.2984374761581421,
      "reward_std": 0.16964475810527802,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 6.84375,
      "completions/mean_terminated_length": 6.84375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07366071428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 22.09980583190918,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.0,
      "num_tokens": 3908627.0,
      "reward": 0.39921873807907104,
      "reward_std": 0.1689612716436386,
      "rewards/format_reward/mean": 0.5546875,
      "rewards/format_reward/std": 0.180765300989151,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 568.0,
      "completions/max_terminated_length": 568.0,
      "completions/mean_length": 19.828125,
      "completions/mean_terminated_length": 19.828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07589285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 24.214155197143555,
      "learning_rate": 8.005405736415125e-07,
      "loss": 0.0,
      "num_tokens": 3997160.0,
      "reward": 0.3937499523162842,
      "reward_std": 0.3061639666557312,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 476.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 16.640625,
      "completions/mean_terminated_length": 16.640625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.078125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 19.65945053100586,
      "learning_rate": 7.871643313414718e-07,
      "loss": -0.0,
      "num_tokens": 4102257.0,
      "reward": 0.33124998211860657,
      "reward_std": 0.3424546718597412,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 9.828125,
      "completions/mean_terminated_length": 9.828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08035714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 16.9875545501709,
      "learning_rate": 7.734740790612136e-07,
      "loss": 0.0,
      "num_tokens": 4218214.0,
      "reward": 0.40625,
      "reward_std": 0.22461532056331635,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 16.703125,
      "completions/mean_terminated_length": 16.703125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08258928571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.114908218383789,
      "learning_rate": 7.594847868906076e-07,
      "loss": -0.0,
      "num_tokens": 4304195.0,
      "reward": 0.5093749761581421,
      "reward_std": 0.1893727034330368,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 6.8125,
      "completions/mean_terminated_length": 6.8125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08482142857142858,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.950824737548828,
      "learning_rate": 7.452117519152541e-07,
      "loss": -0.0,
      "num_tokens": 4430631.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.29143062233924866,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 10.125,
      "completions/mean_terminated_length": 10.125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08705357142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 25.844154357910156,
      "learning_rate": 7.306705814893439e-07,
      "loss": -0.0,
      "num_tokens": 4522799.0,
      "reward": 0.4312499761581421,
      "reward_std": 0.2177756428718567,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 60.0,
      "completions/max_terminated_length": 60.0,
      "completions/mean_length": 8.203125,
      "completions/mean_terminated_length": 8.203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08928571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 31.361331939697266,
      "learning_rate": 7.158771761692464e-07,
      "loss": 0.0,
      "num_tokens": 4608428.0,
      "reward": 0.6703124642372131,
      "reward_std": 0.049638569355010986,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.2083333432674408,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 8.90625,
      "completions/mean_terminated_length": 8.90625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09151785714285714,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 17.690139770507812,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.0,
      "num_tokens": 4691318.0,
      "reward": 0.5726562738418579,
      "reward_std": 0.2708982527256012,
      "rewards/format_reward/mean": 0.7265625,
      "rewards/format_reward/std": 0.250866562128067,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 6.09375,
      "completions/mean_terminated_length": 6.09375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 18.50206756591797,
      "learning_rate": 6.855986244591103e-07,
      "loss": -0.0,
      "num_tokens": 4789988.0,
      "reward": 0.4257812201976776,
      "reward_std": 0.2243541181087494,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 359.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 16.03125,
      "completions/mean_terminated_length": 16.03125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09598214285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 54.797157287597656,
      "learning_rate": 6.701465872208216e-07,
      "loss": -0.0,
      "num_tokens": 4877334.0,
      "reward": 0.8117187023162842,
      "reward_std": 0.18093490600585938,
      "rewards/format_reward/mean": 0.6171875,
      "rewards/format_reward/std": 0.21347814798355103,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 7.515625,
      "completions/mean_terminated_length": 7.515625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09821428571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 12.843533515930176,
      "learning_rate": 6.545084971874736e-07,
      "loss": 0.0,
      "num_tokens": 4983247.0,
      "reward": 0.59375,
      "reward_std": 0.16675157845020294,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 8.375,
      "completions/mean_terminated_length": 8.375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10044642857142858,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 21.998193740844727,
      "learning_rate": 6.387014543809223e-07,
      "loss": -0.0,
      "num_tokens": 5059511.0,
      "reward": 0.49140626192092896,
      "reward_std": 0.20160752534866333,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.24587368965148926,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 7.546875,
      "completions/mean_terminated_length": 7.546875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10267857142857142,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 28.897733688354492,
      "learning_rate": 6.227427435703995e-07,
      "loss": -0.0,
      "num_tokens": 5150106.0,
      "reward": 0.28125,
      "reward_std": 0.28247907757759094,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 6.109375,
      "completions/mean_terminated_length": 6.109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10491071428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 23.489076614379883,
      "learning_rate": 6.066498153718734e-07,
      "loss": -0.0,
      "num_tokens": 5243569.0,
      "reward": 0.5351561903953552,
      "reward_std": 0.18003448843955994,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 9.90625,
      "completions/mean_terminated_length": 9.90625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.710181713104248,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.0,
      "num_tokens": 5309691.0,
      "reward": 0.17500001192092896,
      "reward_std": 0.10888782143592834,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.24397502839565277,
      "rewards/mcq_exact_match_reward/mean": 0.09375,
      "rewards/mcq_exact_match_reward/std": 0.29378482699394226,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 43.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 8.8125,
      "completions/mean_terminated_length": 8.8125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.109375,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 21.57369613647461,
      "learning_rate": 5.741318238559209e-07,
      "loss": 0.0,
      "num_tokens": 5411943.0,
      "reward": 0.5812499523162842,
      "reward_std": 0.2802783250808716,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 572.0,
      "completions/max_terminated_length": 572.0,
      "completions/mean_length": 20.78125,
      "completions/mean_terminated_length": 20.78125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.11160714285714286,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 31.959693908691406,
      "learning_rate": 5.577423184847931e-07,
      "loss": 0.0,
      "num_tokens": 5512089.0,
      "reward": 0.40937498211860657,
      "reward_std": 0.20373183488845825,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.233588308095932,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 5512089,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}