goldengoose-divsweep_goose_…/checkpoint-50/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11160714285714286,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1386.0,
      "completions/mean_length": 515.03125,
      "completions/mean_terminated_length": 490.6984558105469,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.002232142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.372617721557617,
      "learning_rate": 0.0,
      "loss": 0.0,
      "num_tokens": 116874.0,
      "reward": 0.296875,
      "reward_std": 0.36100417375564575,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.24397502839565277,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1737.0,
      "completions/max_terminated_length": 1737.0,
      "completions/mean_length": 464.234375,
      "completions/mean_terminated_length": 464.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.004464285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.42634105682373,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 230473.0,
      "reward": 0.33281248807907104,
      "reward_std": 0.35629093647003174,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.28824523091316223,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1479.0,
      "completions/max_terminated_length": 1479.0,
      "completions/mean_length": 375.984375,
      "completions/mean_terminated_length": 375.984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.006696428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.815962791442871,
      "learning_rate": 4e-07,
      "loss": 0.0,
      "num_tokens": 346168.0,
      "reward": 0.21718749403953552,
      "reward_std": 0.3329676389694214,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.2630521357059479,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1428.0,
      "completions/mean_length": 490.515625,
      "completions/mean_terminated_length": 386.683349609375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.008928571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.687461853027344,
      "learning_rate": 6e-07,
      "loss": 0.0,
      "num_tokens": 486377.0,
      "reward": 0.2593749761581421,
      "reward_std": 0.3835652470588684,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1506.0,
      "completions/mean_length": 606.28125,
      "completions/mean_terminated_length": 535.3770141601562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.011160714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.5729923248291,
      "learning_rate": 8e-07,
      "loss": 0.0,
      "num_tokens": 620971.0,
      "reward": 0.23984374105930328,
      "reward_std": 0.3459582030773163,
      "rewards/format_reward/mean": 0.3671875,
      "rewards/format_reward/std": 0.23974503576755524,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1333.0,
      "completions/mean_length": 531.765625,
      "completions/mean_terminated_length": 482.8548278808594,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.013392857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.315047264099121,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 739660.0,
      "reward": 0.28203123807907104,
      "reward_std": 0.23655115067958832,
      "rewards/format_reward/mean": 0.3203125,
      "rewards/format_reward/std": 0.27265870571136475,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1883.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 534.65625,
      "completions/mean_terminated_length": 534.65625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.712610244750977,
      "learning_rate": 9.99726628670463e-07,
      "loss": -0.0,
      "num_tokens": 868742.0,
      "reward": 0.3554687201976776,
      "reward_std": 0.20643417537212372,
      "rewards/format_reward/mean": 0.4296875,
      "rewards/format_reward/std": 0.23345555365085602,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 639.4375,
      "completions/mean_terminated_length": 570.1638793945312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.017857142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.94524097442627,
      "learning_rate": 9.989068136093872e-07,
      "loss": 0.0,
      "num_tokens": 1012874.0,
      "reward": 0.20781248807907104,
      "reward_std": 0.30935126543045044,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2741328477859497,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1730.0,
      "completions/max_terminated_length": 1730.0,
      "completions/mean_length": 345.8125,
      "completions/mean_terminated_length": 345.8125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.020089285714285716,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 10.861742973327637,
      "learning_rate": 9.975414512725056e-07,
      "loss": -0.0,
      "num_tokens": 1141086.0,
      "reward": 0.2593749761581421,
      "reward_std": 0.2899813652038574,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.21593283116817474,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1430.0,
      "completions/max_terminated_length": 1430.0,
      "completions/mean_length": 466.390625,
      "completions/mean_terminated_length": 466.390625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.022321428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.960936546325684,
      "learning_rate": 9.956320346634875e-07,
      "loss": 0.0,
      "num_tokens": 1260815.0,
      "reward": 0.27421873807907104,
      "reward_std": 0.3824688494205475,
      "rewards/format_reward/mean": 0.3984375,
      "rewards/format_reward/std": 0.28423789143562317,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1601.0,
      "completions/mean_length": 428.53125,
      "completions/mean_terminated_length": 376.2903137207031,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.024553571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 18.18084716796875,
      "learning_rate": 9.931806517013612e-07,
      "loss": 0.0,
      "num_tokens": 1379769.0,
      "reward": 0.23281249403953552,
      "reward_std": 0.2880294919013977,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.2630521357059479,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 680.0,
      "completions/max_terminated_length": 680.0,
      "completions/mean_length": 135.90625,
      "completions/mean_terminated_length": 135.90625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.026785714285714284,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 17.66448402404785,
      "learning_rate": 9.901899829374047e-07,
      "loss": -0.0,
      "num_tokens": 1470971.0,
      "reward": 0.4703124761581421,
      "reward_std": 0.28840553760528564,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.17747680842876434,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1568.0,
      "completions/max_terminated_length": 1568.0,
      "completions/mean_length": 217.578125,
      "completions/mean_terminated_length": 217.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.029017857142857144,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.263340950012207,
      "learning_rate": 9.866632986240029e-07,
      "loss": 0.0,
      "num_tokens": 1578096.0,
      "reward": 0.2523437440395355,
      "reward_std": 0.11321917921304703,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 842.0,
      "completions/max_terminated_length": 842.0,
      "completions/mean_length": 99.265625,
      "completions/mean_terminated_length": 99.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 18.594722747802734,
      "learning_rate": 9.826044551386742e-07,
      "loss": 0.0,
      "num_tokens": 1670537.0,
      "reward": 0.42812496423721313,
      "reward_std": 0.3584539592266083,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 533.0,
      "completions/max_terminated_length": 533.0,
      "completions/mean_length": 27.75,
      "completions/mean_terminated_length": 27.75,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.033482142857142856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.696029663085938,
      "learning_rate": 9.780178907671788e-07,
      "loss": -0.0,
      "num_tokens": 1772369.0,
      "reward": 0.5031249523162842,
      "reward_std": 0.11100947111845016,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03571428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.516544342041016,
      "learning_rate": 9.729086208503173e-07,
      "loss": -0.0,
      "num_tokens": 1875425.0,
      "reward": 0.40937498211860657,
      "reward_std": 0.12255740165710449,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03794642857142857,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 21.267547607421875,
      "learning_rate": 9.672822322997304e-07,
      "loss": 0.0,
      "num_tokens": 1988737.0,
      "reward": 0.28437498211860657,
      "reward_std": 0.26196980476379395,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04017857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 20.922260284423828,
      "learning_rate": 9.611448774886923e-07,
      "loss": -0.0,
      "num_tokens": 2100009.0,
      "reward": 0.7687499523162842,
      "reward_std": 0.1872510462999344,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 59.0,
      "completions/max_terminated_length": 59.0,
      "completions/mean_length": 6.828125,
      "completions/mean_terminated_length": 6.828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04241071428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 22.29154396057129,
      "learning_rate": 9.545032675245813e-07,
      "loss": -0.0,
      "num_tokens": 2228990.0,
      "reward": 0.39374998211860657,
      "reward_std": 0.23356686532497406,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 40.0,
      "completions/max_terminated_length": 40.0,
      "completions/mean_length": 6.53125,
      "completions/mean_terminated_length": 6.53125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.044642857142857144,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 17.501502990722656,
      "learning_rate": 9.473646649103817e-07,
      "loss": 0.0,
      "num_tokens": 2338600.0,
      "reward": 0.4093749523162842,
      "reward_std": 0.19044625759124756,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 7.3125,
      "completions/mean_terminated_length": 7.3125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.046875,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.592903137207031,
      "learning_rate": 9.397368756032444e-07,
      "loss": 0.0,
      "num_tokens": 2423444.0,
      "reward": 0.19999998807907104,
      "reward_std": 0.06365012377500534,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.049107142857142856,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.976240158081055,
      "learning_rate": 9.316282404787869e-07,
      "loss": -0.0,
      "num_tokens": 2520124.0,
      "reward": 0.31562498211860657,
      "reward_std": 0.17782476544380188,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 540.0,
      "completions/max_terminated_length": 540.0,
      "completions/mean_length": 17.328125,
      "completions/mean_terminated_length": 17.328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05133928571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.27862548828125,
      "learning_rate": 9.230476262104676e-07,
      "loss": -0.0,
      "num_tokens": 2650241.0,
      "reward": 0.5812499523162842,
      "reward_std": 0.15520364046096802,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.0,
      "completions/max_terminated_length": 11.0,
      "completions/mean_length": 6.078125,
      "completions/mean_terminated_length": 6.078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05357142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.152713775634766,
      "learning_rate": 9.1400441557401e-07,
      "loss": -0.0,
      "num_tokens": 2770894.0,
      "reward": 0.19062498211860657,
      "reward_std": 0.2198972851037979,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 9.203125,
      "completions/mean_terminated_length": 9.203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05580357142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 29.53569984436035,
      "learning_rate": 9.045084971874737e-07,
      "loss": -0.0,
      "num_tokens": 2841411.0,
      "reward": 0.4406249523162842,
      "reward_std": 0.17782476544380188,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05803571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 23.521568298339844,
      "learning_rate": 8.945702546981968e-07,
      "loss": -0.0,
      "num_tokens": 2926323.0,
      "reward": 0.5031249523162842,
      "reward_std": 0.15992167592048645,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.060267857142857144,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 17.729854583740234,
      "learning_rate": 8.842005554284295e-07,
      "loss": -0.0,
      "num_tokens": 3047355.0,
      "reward": 0.4093749523162842,
      "reward_std": 0.15992167592048645,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.808194160461426,
      "learning_rate": 8.734107384920769e-07,
      "loss": 0.0,
      "num_tokens": 3145515.0,
      "reward": 0.29999998211860657,
      "reward_std": 0.0883883386850357,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06473214285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 18.28078269958496,
      "learning_rate": 8.622126023955445e-07,
      "loss": -0.0,
      "num_tokens": 3238355.0,
      "reward": 0.28437498211860657,
      "reward_std": 0.16887322068214417,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 650.0,
      "completions/max_terminated_length": 650.0,
      "completions/mean_length": 16.0625,
      "completions/mean_terminated_length": 16.0625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06696428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.452601432800293,
      "learning_rate": 8.506183921362442e-07,
      "loss": -0.0,
      "num_tokens": 3350191.0,
      "reward": 0.3781249523162842,
      "reward_std": 0.10205792635679245,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06919642857142858,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 32.80612564086914,
      "learning_rate": 8.386407858128706e-07,
      "loss": -0.0,
      "num_tokens": 3443559.0,
      "reward": 0.33124998211860657,
      "reward_std": 0.21306739747524261,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 10.84375,
      "completions/mean_terminated_length": 10.84375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 21.804208755493164,
      "learning_rate": 8.262928807620843e-07,
      "loss": 0.0,
      "num_tokens": 3553701.0,
      "reward": 0.40937498211860657,
      "reward_std": 0.2109457403421402,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07366071428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.285557746887207,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.0,
      "num_tokens": 3658485.0,
      "reward": 0.40937498211860657,
      "reward_std": 0.1530819833278656,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07589285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.146930694580078,
      "learning_rate": 8.005405736415125e-07,
      "loss": -0.0,
      "num_tokens": 3753581.0,
      "reward": 0.5031249523162842,
      "reward_std": 0.11100947111845016,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.078125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.522613525390625,
      "learning_rate": 7.871643313414718e-07,
      "loss": -0.0,
      "num_tokens": 3851317.0,
      "reward": 0.4562499523162842,
      "reward_std": 0.0578637532889843,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 29.0,
      "completions/max_terminated_length": 29.0,
      "completions/mean_length": 6.578125,
      "completions/mean_terminated_length": 6.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08035714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 38.662662506103516,
      "learning_rate": 7.734740790612136e-07,
      "loss": 0.0,
      "num_tokens": 3935226.0,
      "reward": 0.34609371423721313,
      "reward_std": 0.20160752534866333,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08258928571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.49333572387695,
      "learning_rate": 7.594847868906076e-07,
      "loss": -0.0,
      "num_tokens": 4054722.0,
      "reward": 0.5968749523162842,
      "reward_std": 0.11100947856903076,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08482142857142858,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 38.19132614135742,
      "learning_rate": 7.452117519152541e-07,
      "loss": 0.0,
      "num_tokens": 4151506.0,
      "reward": 0.7164062261581421,
      "reward_std": 0.15529169142246246,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08705357142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 30.554147720336914,
      "learning_rate": 7.306705814893439e-07,
      "loss": 0.0,
      "num_tokens": 4230210.0,
      "reward": 0.3937499523162842,
      "reward_std": 0.19727614521980286,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08928571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 23.661088943481445,
      "learning_rate": 7.158771761692464e-07,
      "loss": -0.0,
      "num_tokens": 4335138.0,
      "reward": 0.34609371423721313,
      "reward_std": 0.15529169142246246,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09151785714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 32.43943786621094,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.0,
      "num_tokens": 4439562.0,
      "reward": 0.23749998211860657,
      "reward_std": 0.16675156354904175,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09375,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 13.679961204528809,
      "learning_rate": 6.855986244591103e-07,
      "loss": -0.0,
      "num_tokens": 4555042.0,
      "reward": 0.5031249523162842,
      "reward_std": 0.13258251547813416,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09598214285714286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.701465872208216e-07,
      "loss": 0.0,
      "num_tokens": 4661210.0,
      "reward": 0.4249999523162842,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 7.859375,
      "completions/mean_terminated_length": 7.859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09821428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.7772626876831055,
      "learning_rate": 6.545084971874736e-07,
      "loss": -0.0,
      "num_tokens": 4763577.0,
      "reward": 0.4093749523162842,
      "reward_std": 0.04419417306780815,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10044642857142858,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 29.50361442565918,
      "learning_rate": 6.387014543809223e-07,
      "loss": 0.0,
      "num_tokens": 4850985.0,
      "reward": 0.5499999523162842,
      "reward_std": 0.16675156354904175,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10267857142857142,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.329254150390625,
      "learning_rate": 6.227427435703995e-07,
      "loss": -0.0,
      "num_tokens": 4946729.0,
      "reward": 0.5187499523162842,
      "reward_std": 0.0578637532889843,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10491071428571429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.066498153718734e-07,
      "loss": 0.0,
      "num_tokens": 5027633.0,
      "reward": 0.6749999523162842,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.10714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.0,
      "num_tokens": 5123361.0,
      "reward": 0.5499999523162842,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 6.0,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.109375,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.217447280883789,
      "learning_rate": 5.741318238559209e-07,
      "loss": -0.0,
      "num_tokens": 5215425.0,
      "reward": 0.5187499523162842,
      "reward_std": 0.0578637532889843,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 12.4375,
      "completions/mean_terminated_length": 12.4375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.11160714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 29.34973907470703,
      "learning_rate": 5.577423184847931e-07,
      "loss": 0.0,
      "num_tokens": 5331149.0,
      "reward": 0.26874998211860657,
      "reward_std": 0.1462520956993103,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 5331149,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}