{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.14285714285714285,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1696.0,
      "completions/mean_length": 508.0,
      "completions/mean_terminated_length": 458.32257080078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.14663860481232405,
      "epoch": 0.0007142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.91739273071289,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 148816.0,
      "reward": 0.27421873807907104,
      "reward_std": 0.4313132166862488,
      "rewards/format_reward/mean": 0.3984375,
      "rewards/format_reward/std": 0.22146137058734894,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 1,
      "step_time": 171.41765936795855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1504.0,
      "completions/max_terminated_length": 1504.0,
      "completions/mean_length": 377.046875,
      "completions/mean_terminated_length": 377.046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.20175037905573845,
      "epoch": 0.0014285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.119359016418457,
      "learning_rate": 5.555555555555555e-08,
      "loss": -0.0,
      "num_tokens": 255907.0,
      "reward": 0.53125,
      "reward_std": 0.5093957781791687,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.2745848298072815,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 2,
      "step_time": 83.64522138307802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 660.625,
      "completions/mean_terminated_length": 638.6032104492188,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "entropy": 0.14103460405021906,
      "epoch": 0.002142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2874624729156494,
      "learning_rate": 1.111111111111111e-07,
      "loss": -0.0,
      "num_tokens": 381059.0,
      "reward": 0.43281248211860657,
      "reward_std": 0.4954730272293091,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.25539806485176086,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 3,
      "step_time": 131.4170093961293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 451.96875,
      "completions/mean_terminated_length": 400.4838562011719,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.19842500798404217,
      "epoch": 0.002857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.316102027893066,
      "learning_rate": 1.6666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 483425.0,
      "reward": 0.24921873211860657,
      "reward_std": 0.4258970022201538,
      "rewards/format_reward/mean": 0.3046875,
      "rewards/format_reward/std": 0.2615155577659607,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 4,
      "step_time": 132.2972059249878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1611.0,
      "completions/max_terminated_length": 1611.0,
      "completions/mean_length": 623.953125,
      "completions/mean_terminated_length": 623.953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.1606526281684637,
      "epoch": 0.0035714285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.722296714782715,
      "learning_rate": 2.222222222222222e-07,
      "loss": -0.0,
      "num_tokens": 604470.0,
      "reward": 0.39531248807907104,
      "reward_std": 0.4883336126804352,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2592533528804779,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 5,
      "step_time": 119.59757148602512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1604.0,
      "completions/mean_length": 401.75,
      "completions/mean_terminated_length": 375.61907958984375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.1858495082706213,
      "epoch": 0.004285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.736406326293945,
      "learning_rate": 2.7777777777777776e-07,
      "loss": -0.0,
      "num_tokens": 713742.0,
      "reward": 0.33281245827674866,
      "reward_std": 0.4670252799987793,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2741328477859497,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 6,
      "step_time": 122.30180106399348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1199.0,
      "completions/max_terminated_length": 1199.0,
      "completions/mean_length": 467.265625,
      "completions/mean_terminated_length": 467.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.16867963038384914,
      "epoch": 0.005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.460102558135986,
      "learning_rate": 3.333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 813639.0,
      "reward": 0.36796873807907104,
      "reward_std": 0.4780389070510864,
      "rewards/format_reward/mean": 0.3984375,
      "rewards/format_reward/std": 0.31090864539146423,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 7,
      "step_time": 73.72938018315472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 508.4375,
      "completions/mean_terminated_length": 484.0000305175781,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.14686184097081423,
      "epoch": 0.005714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.620323181152344,
      "learning_rate": 3.888888888888889e-07,
      "loss": -0.0,
      "num_tokens": 956587.0,
      "reward": 0.25312498211860657,
      "reward_std": 0.42565304040908813,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 8,
      "step_time": 147.8402461669757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1204.0,
      "completions/mean_length": 412.25,
      "completions/mean_terminated_length": 386.2857360839844,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18391940742731094,
      "epoch": 0.0064285714285714285,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 9.683939933776855,
      "learning_rate": 4.444444444444444e-07,
      "loss": -0.0,
      "num_tokens": 1075259.0,
      "reward": 0.3140624761581421,
      "reward_std": 0.4561282992362976,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.2847827076911926,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 9,
      "step_time": 108.42918385588564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1264.0,
      "completions/mean_length": 521.125,
      "completions/mean_terminated_length": 496.888916015625,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.17283021286129951,
      "epoch": 0.007142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.3954830169677734,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 1193907.0,
      "reward": 0.16171874105930328,
      "reward_std": 0.3368554413318634,
      "rewards/format_reward/mean": 0.3671875,
      "rewards/format_reward/std": 0.23974503576755524,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 10,
      "step_time": 118.48989919497399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1067.0,
      "completions/max_terminated_length": 1067.0,
      "completions/mean_length": 442.40625,
      "completions/mean_terminated_length": 442.40625,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "entropy": 0.19687055423855782,
      "epoch": 0.007857142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.561274528503418,
      "learning_rate": 5.555555555555555e-07,
      "loss": 0.0,
      "num_tokens": 1288341.0,
      "reward": 0.30000001192092896,
      "reward_std": 0.44818857312202454,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 11,
      "step_time": 55.07885626098141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1393.0,
      "completions/mean_length": 442.171875,
      "completions/mean_terminated_length": 416.68255615234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.22921719774603844,
      "epoch": 0.008571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.506412506103516,
      "learning_rate": 6.111111111111112e-07,
      "loss": 0.0,
      "num_tokens": 1388512.0,
      "reward": 0.3687499761581421,
      "reward_std": 0.48094648122787476,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.33184191584587097,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 12,
      "step_time": 119.49877157399897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 471.09375,
      "completions/mean_terminated_length": 420.2257995605469,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.14649338461458683,
      "epoch": 0.009285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.722030639648438,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1511798.0,
      "reward": 0.22499997913837433,
      "reward_std": 0.40029749274253845,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.26726123690605164,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 13,
      "step_time": 160.80015432706568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1346.0,
      "completions/mean_length": 409.171875,
      "completions/mean_terminated_length": 356.3064270019531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.14898105338215828,
      "epoch": 0.01,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 38.745059967041016,
      "learning_rate": 7.222222222222221e-07,
      "loss": -0.0,
      "num_tokens": 1651729.0,
      "reward": 0.2789062261581421,
      "reward_std": 0.4511716961860657,
      "rewards/format_reward/mean": 0.2890625,
      "rewards/format_reward/std": 0.2789533734321594,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 14,
      "step_time": 193.12235332495766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1252.0,
      "completions/max_terminated_length": 1252.0,
      "completions/mean_length": 466.265625,
      "completions/mean_terminated_length": 466.265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.19658867083489895,
      "epoch": 0.010714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 34.59446334838867,
      "learning_rate": 7.777777777777778e-07,
      "loss": 0.0,
      "num_tokens": 1746826.0,
      "reward": 0.23281247913837433,
      "reward_std": 0.4091433882713318,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.30496877431869507,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 15,
      "step_time": 68.50137870694743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1516.0,
      "completions/mean_length": 417.140625,
      "completions/mean_terminated_length": 364.5322570800781,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.15556670725345612,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 6.721803188323975,
      "learning_rate": 8.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1870859.0,
      "reward": 0.34062498807907104,
      "reward_std": 0.4673358201980591,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.3149704039096832,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 16,
      "step_time": 159.5833295909688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1605.0,
      "completions/mean_length": 444.390625,
      "completions/mean_terminated_length": 392.6612854003906,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18389248382300138,
      "epoch": 0.012142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25290298461914,
      "learning_rate": 8.888888888888888e-07,
      "loss": 0.0,
      "num_tokens": 1987036.0,
      "reward": 0.16093748807907104,
      "reward_std": 0.34462639689445496,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.301698237657547,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 17,
      "step_time": 163.66844313696492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1489.0,
      "completions/mean_length": 497.984375,
      "completions/mean_terminated_length": 447.9838562011719,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.2151591945439577,
      "epoch": 0.012857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 49.3928108215332,
      "learning_rate": 9.444444444444444e-07,
      "loss": -0.0,
      "num_tokens": 2084659.0,
      "reward": 0.32343748211860657,
      "reward_std": 0.4590334892272949,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.29839184880256653,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 18,
      "step_time": 127.23080269095954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1315.0,
      "completions/max_terminated_length": 1315.0,
      "completions/mean_length": 358.28125,
      "completions/mean_terminated_length": 358.28125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.16461750492453575,
      "epoch": 0.013571428571428571,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 33.24488830566406,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 2189413.0,
      "reward": 0.21406248211860657,
      "reward_std": 0.3865258991718292,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.2221602201461792,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 19,
      "step_time": 81.95023821806535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 443.28125,
      "completions/mean_terminated_length": 417.8095397949219,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.16434035263955593,
      "epoch": 0.014285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.118695259094238,
      "learning_rate": 9.999776148326214e-07,
      "loss": -0.0,
      "num_tokens": 2326511.0,
      "reward": 0.42656248807907104,
      "reward_std": 0.48713353276252747,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1985812783241272,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 20,
      "step_time": 171.49558448110474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1352.0,
      "completions/max_terminated_length": 1352.0,
      "completions/mean_length": 286.140625,
      "completions/mean_terminated_length": 286.140625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2154129333794117,
      "epoch": 0.015,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 16.506973266601562,
      "learning_rate": 9.999104613348689e-07,
      "loss": -0.0,
      "num_tokens": 2431592.0,
      "reward": 0.33203125,
      "reward_std": 0.45828935503959656,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.18881812691688538,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 21,
      "step_time": 102.2051269490039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 981.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 351.015625,
      "completions/mean_terminated_length": 351.015625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2384468913078308,
      "epoch": 0.015714285714285715,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 8.052404403686523,
      "learning_rate": 9.997985455197113e-07,
      "loss": -0.0,
      "num_tokens": 2517985.0,
      "reward": 0.20859375596046448,
      "reward_std": 0.37886154651641846,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.28770697116851807,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 22,
      "step_time": 47.53120892500738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1639.0,
      "completions/max_terminated_length": 1639.0,
      "completions/mean_length": 493.734375,
      "completions/mean_terminated_length": 493.734375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.15667208284139633,
      "epoch": 0.016428571428571428,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 4.182269096374512,
      "learning_rate": 9.996418774081656e-07,
      "loss": 0.0,
      "num_tokens": 2643640.0,
      "reward": 0.2679687440395355,
      "reward_std": 0.41770032048225403,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.22699186205863953,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 23,
      "step_time": 136.94159113999922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1209.0,
      "completions/max_terminated_length": 1209.0,
      "completions/mean_length": 334.75,
      "completions/mean_terminated_length": 334.75,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2605742085725069,
      "epoch": 0.017142857142857144,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 2.6181070804595947,
      "learning_rate": 9.994404710283998e-07,
      "loss": 0.0,
      "num_tokens": 2743904.0,
      "reward": 0.08046875149011612,
      "reward_std": 0.17719532549381256,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.24384792149066925,
      "rewards/mcq_exact_match_reward/mean": 0.03125,
      "rewards/mcq_exact_match_reward/std": 0.17536810040473938,
      "step": 24,
      "step_time": 67.75085840100655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 991.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 167.71875,
      "completions/mean_terminated_length": 167.71875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2089465633034706,
      "epoch": 0.017857142857142856,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 21.222145080566406,
      "learning_rate": 9.991943444144756e-07,
      "loss": -0.0,
      "num_tokens": 2839630.0,
      "reward": 0.3820312023162842,
      "reward_std": 0.4708458185195923,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 25,
      "step_time": 46.71443971898407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 692.0,
      "completions/max_terminated_length": 692.0,
      "completions/mean_length": 217.5,
      "completions/mean_terminated_length": 217.5,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.25668232701718807,
      "epoch": 0.018571428571428572,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 17.222293853759766,
      "learning_rate": 9.989035196047348e-07,
      "loss": -0.0,
      "num_tokens": 2927590.0,
      "reward": 0.16249999403953552,
      "reward_std": 0.3169797956943512,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.109375,
      "rewards/mcq_exact_match_reward/std": 0.3145764470100403,
      "step": 26,
      "step_time": 41.34394903801149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 324.53125,
      "completions/mean_terminated_length": 297.17462158203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18609545193612576,
      "epoch": 0.019285714285714285,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 17.366985321044922,
      "learning_rate": 9.98568022639826e-07,
      "loss": 0.0,
      "num_tokens": 3043752.0,
      "reward": 0.28359371423721313,
      "reward_std": 0.431318998336792,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.1406387835741043,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 27,
      "step_time": 145.95341787295183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1195.0,
      "completions/max_terminated_length": 1195.0,
      "completions/mean_length": 215.234375,
      "completions/mean_terminated_length": 215.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2353730145841837,
      "epoch": 0.02,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 17.270282745361328,
      "learning_rate": 9.981878835603716e-07,
      "loss": 0.0,
      "num_tokens": 3131783.0,
      "reward": 0.27578121423721313,
      "reward_std": 0.4189927279949188,
      "rewards/format_reward/mean": 0.5703125,
      "rewards/format_reward/std": 0.1751912236213684,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 28,
      "step_time": 48.64892271097051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 865.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 214.15625,
      "completions/mean_terminated_length": 214.15625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2909251060336828,
      "epoch": 0.020714285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.325923919677734,
      "learning_rate": 9.977631364042794e-07,
      "loss": -0.0,
      "num_tokens": 3226177.0,
      "reward": 0.4117187261581421,
      "reward_std": 0.4837634861469269,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.1649840772151947,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 29,
      "step_time": 55.44644622900523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1167.0,
      "completions/mean_length": 308.875,
      "completions/mean_terminated_length": 281.2698669433594,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2629696223884821,
      "epoch": 0.02142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.076370239257812,
      "learning_rate": 9.972938192036944e-07,
      "loss": 0.0,
      "num_tokens": 3343833.0,
      "reward": 0.27421873807907104,
      "reward_std": 0.4146132171154022,
      "rewards/format_reward/mean": 0.5546875,
      "rewards/format_reward/std": 0.26899561285972595,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 30,
      "step_time": 177.31874076800887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 520.0,
      "completions/max_terminated_length": 520.0,
      "completions/mean_length": 236.171875,
      "completions/mean_terminated_length": 236.171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2660892754793167,
      "epoch": 0.02214285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.033559799194336,
      "learning_rate": 9.967799739815924e-07,
      "loss": 0.0,
      "num_tokens": 3407684.0,
      "reward": 0.4117187261581421,
      "reward_std": 0.4798099994659424,
      "rewards/format_reward/mean": 0.6796875,
      "rewards/format_reward/std": 0.30035942792892456,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 31,
      "step_time": 19.77746521908557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 762.0,
      "completions/mean_length": 267.140625,
      "completions/mean_terminated_length": 238.87303161621094,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2127502802759409,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 21.78681182861328,
      "learning_rate": 9.96221646748019e-07,
      "loss": -0.0,
      "num_tokens": 3501853.0,
      "reward": 0.390625,
      "reward_std": 0.47525057196617126,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.26726123690605164,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 32,
      "step_time": 118.76398004795192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1112.0,
      "completions/max_terminated_length": 1112.0,
      "completions/mean_length": 195.171875,
      "completions/mean_terminated_length": 195.171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.26218850910663605,
      "epoch": 0.023571428571428573,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 13.382572174072266,
      "learning_rate": 9.956188874959686e-07,
      "loss": 0.0,
      "num_tokens": 3603568.0,
      "reward": 0.19062499701976776,
      "reward_std": 0.3306888937950134,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 33,
      "step_time": 57.80779201700352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 203.953125,
      "completions/mean_terminated_length": 203.953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.21508901193737984,
      "epoch": 0.024285714285714285,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 18.717557907104492,
      "learning_rate": 9.949717501969079e-07,
      "loss": 0.0,
      "num_tokens": 3688533.0,
      "reward": 0.5679687261581421,
      "reward_std": 0.5023252964019775,
      "rewards/format_reward/mean": 0.6796875,
      "rewards/format_reward/std": 0.27265870571136475,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 34,
      "step_time": 64.80360024399124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1353.0,
      "completions/max_terminated_length": 1353.0,
      "completions/mean_length": 254.78125,
      "completions/mean_terminated_length": 254.78125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.27256614714860916,
      "epoch": 0.025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.232938766479492,
      "learning_rate": 9.942802927959442e-07,
      "loss": -0.0,
      "num_tokens": 3775567.0,
      "reward": 0.38124996423721313,
      "reward_std": 0.46908387541770935,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.3149704039096832,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 35,
      "step_time": 69.50821864098543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1524.0,
      "completions/max_terminated_length": 1524.0,
      "completions/mean_length": 279.328125,
      "completions/mean_terminated_length": 279.328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.21617292240262032,
      "epoch": 0.025714285714285714,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 14.788715362548828,
      "learning_rate": 9.93544577206636e-07,
      "loss": 0.0,
      "num_tokens": 3873788.0,
      "reward": 0.24140623211860657,
      "reward_std": 0.38684260845184326,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.2762732207775116,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 36,
      "step_time": 112.71978642407339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1100.0,
      "completions/max_terminated_length": 1100.0,
      "completions/mean_length": 204.046875,
      "completions/mean_terminated_length": 204.046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2291890811175108,
      "epoch": 0.02642857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 22.162996292114258,
      "learning_rate": 9.927646693054495e-07,
      "loss": 0.0,
      "num_tokens": 3949719.0,
      "reward": 0.43906253576278687,
      "reward_std": 0.4866037666797638,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.2777281701564789,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 37,
      "step_time": 62.183381506067235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 675.0,
      "completions/max_terminated_length": 675.0,
      "completions/mean_length": 92.578125,
      "completions/mean_terminated_length": 92.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.21844332106411457,
      "epoch": 0.027142857142857142,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 21.4798641204834,
      "learning_rate": 9.919406389258606e-07,
      "loss": -0.0,
      "num_tokens": 4028188.0,
      "reward": 0.44453126192092896,
      "reward_std": 0.486411988735199,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.2615155577659607,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 38,
      "step_time": 44.90790424309671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 877.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 136.25,
      "completions/mean_terminated_length": 136.25,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2506138999015093,
      "epoch": 0.027857142857142858,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 27.89171028137207,
      "learning_rate": 9.910725598521012e-07,
      "loss": -0.0,
      "num_tokens": 4097708.0,
      "reward": 0.4664062261581421,
      "reward_std": 0.4868961274623871,
      "rewards/format_reward/mean": 0.9140625,
      "rewards/format_reward/std": 0.19012710452079773,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 39,
      "step_time": 39.88727585604647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 999.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 136.984375,
      "completions/mean_terminated_length": 136.984375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.20815920643508434,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 22.442346572875977,
      "learning_rate": 9.901605098125526e-07,
      "loss": -0.0,
      "num_tokens": 4190579.0,
      "reward": 0.38749998807907104,
      "reward_std": 0.4662412703037262,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.26726123690605164,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 40,
      "step_time": 62.169976764998864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 59.859375,
      "completions/mean_terminated_length": 59.859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2938700430095196,
      "epoch": 0.029285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 11.49561882019043,
      "learning_rate": 9.892045704727863e-07,
      "loss": -0.0,
      "num_tokens": 4283034.0,
      "reward": 0.16562500596046448,
      "reward_std": 0.2750000059604645,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.078125,
      "rewards/mcq_exact_match_reward/std": 0.27048972249031067,
      "step": 41,
      "step_time": 32.6582014990272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 861.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 75.765625,
      "completions/mean_terminated_length": 75.765625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.25041171722114086,
      "epoch": 0.03,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 30.742996215820312,
      "learning_rate": 9.882048274282505e-07,
      "loss": 0.0,
      "num_tokens": 4361843.0,
      "reward": 0.57421875,
      "reward_std": 0.5030062198638916,
      "rewards/format_reward/mean": 0.8984375,
      "rewards/format_reward/std": 0.20275264978408813,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 42,
      "step_time": 47.323365143092815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 646.0,
      "completions/max_terminated_length": 646.0,
      "completions/mean_length": 47.390625,
      "completions/mean_terminated_length": 47.390625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18909681774675846,
      "epoch": 0.030714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 22.67137908935547,
      "learning_rate": 9.871613701966066e-07,
      "loss": 0.0,
      "num_tokens": 4457780.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.485504150390625,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.233588308095932,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 43,
      "step_time": 47.4528916090494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 554.0,
      "completions/mean_length": 83.21875,
      "completions/mean_terminated_length": 52.0317497253418,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.17218941450119019,
      "epoch": 0.03142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.08895492553711,
      "learning_rate": 9.86074292209714e-07,
      "loss": 0.0,
      "num_tokens": 4527074.0,
      "reward": 0.628125011920929,
      "reward_std": 0.50661301612854,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 44,
      "step_time": 107.07343659299659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 651.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 36.0,
      "completions/mean_terminated_length": 36.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18100928142666817,
      "epoch": 0.03214285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 13.066922187805176,
      "learning_rate": 9.849436908052636e-07,
      "loss": 0.0,
      "num_tokens": 4608754.0,
      "reward": 0.38203126192092896,
      "reward_std": 0.465761661529541,
      "rewards/format_reward/mean": 0.8515625,
      "rewards/format_reward/std": 0.26246222853660583,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 45,
      "step_time": 30.311806608980987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 837.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 67.34375,
      "completions/mean_terminated_length": 67.34375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.19344223476946354,
      "epoch": 0.032857142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 8.720047950744629,
      "learning_rate": 9.837696672180618e-07,
      "loss": 0.0,
      "num_tokens": 4691800.0,
      "reward": 0.328125,
      "reward_std": 0.4157489538192749,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 46,
      "step_time": 44.673023908922914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 509.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 20.140625,
      "completions/mean_terminated_length": 20.140625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.07546021463349462,
      "epoch": 0.03357142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.505743980407715,
      "learning_rate": 9.825523265709665e-07,
      "loss": -0.0,
      "num_tokens": 4783617.0,
      "reward": 0.71875,
      "reward_std": 0.49629583954811096,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 47,
      "step_time": 28.177909465972334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 20.59375,
      "completions/mean_terminated_length": 20.59375,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "entropy": 0.10830738116055727,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.707826614379883,
      "learning_rate": 9.812917778654747e-07,
      "loss": 0.0,
      "num_tokens": 4861247.0,
      "reward": 0.43125003576278687,
      "reward_std": 0.4888843894004822,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 48,
      "step_time": 10.893696008017287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 749.0,
      "completions/mean_length": 66.125,
      "completions/mean_terminated_length": 34.66666793823242,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.11854543350636959,
      "epoch": 0.035,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.387039184570312,
      "learning_rate": 9.799881339719614e-07,
      "loss": 0.0,
      "num_tokens": 4968215.0,
      "reward": 0.5382812023162842,
      "reward_std": 0.504876434803009,
      "rewards/format_reward/mean": 0.8515625,
      "rewards/format_reward/std": 0.24688033759593964,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 49,
      "step_time": 154.59990453493083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1531.0,
      "completions/max_terminated_length": 1531.0,
      "completions/mean_length": 77.125,
      "completions/mean_terminated_length": 77.125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.09524603839963675,
      "epoch": 0.03571428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 10.865472793579102,
      "learning_rate": 9.786415116195732e-07,
      "loss": 0.0,
      "num_tokens": 5049023.0,
      "reward": 0.44218751788139343,
      "reward_std": 0.47993209958076477,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 50,
      "step_time": 96.41551529400749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 14.734375,
      "completions/mean_terminated_length": 14.734375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.09114427305758,
      "epoch": 0.03642857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 37.82780838012695,
      "learning_rate": 9.772520313857775e-07,
      "loss": 0.0,
      "num_tokens": 5134110.0,
      "reward": 0.5960937738418579,
      "reward_std": 0.5080545544624329,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.13524486124515533,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 51,
      "step_time": 4.045462172012776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 59.0,
      "completions/max_terminated_length": 59.0,
      "completions/mean_length": 13.75,
      "completions/mean_terminated_length": 13.75,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.10188040044158697,
      "epoch": 0.037142857142857144,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.1320161819458,
      "learning_rate": 9.758198176855646e-07,
      "loss": 0.0,
      "num_tokens": 5203342.0,
      "reward": 0.5992187857627869,
      "reward_std": 0.5032033920288086,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 52,
      "step_time": 3.415803858079016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 120.0,
      "completions/max_terminated_length": 120.0,
      "completions/mean_length": 15.71875,
      "completions/mean_terminated_length": 15.71875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.08301922678947449,
      "epoch": 0.03785714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 10.11668586730957,
      "learning_rate": 9.74344998760308e-07,
      "loss": -0.0,
      "num_tokens": 5300012.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 53,
      "step_time": 12.412089038116392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.07525204867124557,
      "epoch": 0.03857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.29720401763916,
      "learning_rate": 9.72827706666282e-07,
      "loss": -0.0,
      "num_tokens": 5393076.0,
      "reward": 0.5835937857627869,
      "reward_std": 0.504507839679718,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 54,
      "step_time": 4.425214122980833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.05632809503003955,
      "epoch": 0.039285714285714285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.864079475402832,
      "learning_rate": 9.712680772628363e-07,
      "loss": 0.0,
      "num_tokens": 5464420.0,
      "reward": 0.4281250238418579,
      "reward_std": 0.4732423424720764,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 55,
      "step_time": 3.023675933131017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 83.0,
      "completions/max_terminated_length": 83.0,
      "completions/mean_length": 14.203125,
      "completions/mean_terminated_length": 14.203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.06479678908362985,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 18.5594539642334,
      "learning_rate": 9.696662502002318e-07,
      "loss": -0.0,
      "num_tokens": 5535681.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.49501484632492065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 56,
      "step_time": 6.536966418905649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 73.0,
      "completions/max_terminated_length": 73.0,
      "completions/mean_length": 14.734375,
      "completions/mean_terminated_length": 14.734375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.11025835108011961,
      "epoch": 0.04071428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.760197639465332,
      "learning_rate": 9.680223689071362e-07,
      "loss": 0.0,
      "num_tokens": 5615464.0,
      "reward": 0.39531251788139343,
      "reward_std": 0.4615982174873352,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 57,
      "step_time": 6.5071131670847535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 53.28125,
      "completions/mean_terminated_length": 21.619049072265625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.060628769686445594,
      "epoch": 0.041428571428571426,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 7.574483871459961,
      "learning_rate": 9.663365805777814e-07,
      "loss": 0.0,
      "num_tokens": 5694530.0,
      "reward": 0.37812501192092896,
      "reward_std": 0.45546722412109375,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 58,
      "step_time": 121.47771771694534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 18.5,
      "completions/mean_terminated_length": 18.5,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.04920098069123924,
      "epoch": 0.04214285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.911975622177124,
      "learning_rate": 9.646090361587827e-07,
      "loss": -0.0,
      "num_tokens": 5767930.0,
      "reward": 0.6937500238418579,
      "reward_std": 0.49501484632492065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 59,
      "step_time": 10.687484149995726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 19.34375,
      "completions/mean_terminated_length": 19.34375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.07043586298823357,
      "epoch": 0.04285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 19.942119598388672,
      "learning_rate": 9.628398903356239e-07,
      "loss": 0.0,
      "num_tokens": 5856232.0,
      "reward": 0.612500011920929,
      "reward_std": 0.5023753046989441,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.12198751419782639,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 60,
      "step_time": 12.981740556890145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.043852697126567364,
      "epoch": 0.04357142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 19.423234939575195,
      "learning_rate": 9.610293015188067e-07,
      "loss": 0.0,
      "num_tokens": 5911984.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 61,
      "step_time": 2.5916157929459587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0222997268429026,
      "epoch": 0.04428571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.171859741210938,
      "learning_rate": 9.59177431829666e-07,
      "loss": 0.0,
      "num_tokens": 5999016.0,
      "reward": 0.8656250238418579,
      "reward_std": 0.42695629596710205,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.765625,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 62,
      "step_time": 4.844649164064322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 651.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 29.203125,
      "completions/mean_terminated_length": 29.203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.06459272187203169,
      "epoch": 0.045,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.733697891235352,
      "learning_rate": 9.572844470858537e-07,
      "loss": 0.0,
      "num_tokens": 6067573.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.502967357635498,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 63,
      "step_time": 33.15641678700922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0357388777192682,
      "epoch": 0.045714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.319589614868164,
      "learning_rate": 9.55350516786491e-07,
      "loss": 0.0,
      "num_tokens": 6119109.0,
      "reward": 0.7093750238418579,
      "reward_std": 0.4917473793029785,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 64,
      "step_time": 2.4223899890785106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.017213885730598122,
      "epoch": 0.04642857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.899584770202637,
      "learning_rate": 9.533758140969912e-07,
      "loss": 0.0,
      "num_tokens": 6189037.0,
      "reward": 0.8812500238418579,
      "reward_std": 0.4166666567325592,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.78125,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 65,
      "step_time": 2.9868784029968083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 19.328125,
      "completions/mean_terminated_length": 19.328125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.042392246425151825,
      "epoch": 0.047142857142857146,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 23.965177536010742,
      "learning_rate": 9.513605158335562e-07,
      "loss": -0.0,
      "num_tokens": 6279914.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 66,
      "step_time": 31.981172394065652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0195480928523466,
      "epoch": 0.047857142857142855,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.419790744781494,
      "learning_rate": 9.493048024473411e-07,
      "loss": 0.0,
      "num_tokens": 6366018.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.502967357635498,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 67,
      "step_time": 4.065540662908461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.03548452723771334,
      "epoch": 0.04857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.068404197692871,
      "learning_rate": 9.47208858008299e-07,
      "loss": 0.0,
      "num_tokens": 6458322.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 68,
      "step_time": 4.998500798013993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.03951962455175817,
      "epoch": 0.04928571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 17.25354766845703,
      "learning_rate": 9.450728701886983e-07,
      "loss": -0.0,
      "num_tokens": 6536578.0,
      "reward": 0.7359374761581421,
      "reward_std": 0.4903407096862793,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 69,
      "step_time": 3.523755496018566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.024328175000846386,
      "epoch": 0.05,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 16.914518356323242,
      "learning_rate": 9.428970302463184e-07,
      "loss": 0.0,
      "num_tokens": 6607098.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 70,
      "step_time": 3.8031876169261523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.04464244609698653,
      "epoch": 0.05071428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.606903076171875,
      "learning_rate": 9.406815330073244e-07,
      "loss": 0.0,
      "num_tokens": 6687066.0,
      "reward": 0.3656250238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 71,
      "step_time": 4.856895222037565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.010779005533549935,
      "epoch": 0.05142857142857143,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.3981804847717285,
      "learning_rate": 9.384265768488224e-07,
      "loss": -0.0,
      "num_tokens": 6763770.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 72,
      "step_time": 3.7144640430342406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.027491965098306537,
      "epoch": 0.052142857142857144,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.469144821166992,
      "learning_rate": 9.36132363681097e-07,
      "loss": 0.0,
      "num_tokens": 6836978.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.502967357635498,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 73,
      "step_time": 3.3146990051609464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.00936318637104705,
      "epoch": 0.05285714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.384286880493164,
      "learning_rate": 9.337990989295304e-07,
      "loss": 0.0,
      "num_tokens": 6908506.0,
      "reward": 0.9125000238418579,
      "reward_std": 0.39339789748191833,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.8125,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 74,
      "step_time": 3.9301627399399877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.015574420220218599,
      "epoch": 0.05357142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.314269915162114e-07,
      "loss": 0.0,
      "num_tokens": 6982170.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 75,
      "step_time": 4.287910231039859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.012644122180063277,
      "epoch": 0.054285714285714284,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.279651641845703,
      "learning_rate": 9.290162538412255e-07,
      "loss": -0.0,
      "num_tokens": 7076114.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 76,
      "step_time": 6.48582560592331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.043263796949759126,
      "epoch": 0.055,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.609958648681641,
      "learning_rate": 9.265671017636382e-07,
      "loss": 0.0,
      "num_tokens": 7180538.0,
      "reward": 0.5843750238418579,
      "reward_std": 0.5037065148353577,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 77,
      "step_time": 6.952459908090532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02222421654732898,
      "epoch": 0.055714285714285716,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 8.704394340515137,
      "learning_rate": 9.240797545821666e-07,
      "loss": -0.0,
      "num_tokens": 7261706.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 78,
      "step_time": 3.9689354329602793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02659488166682422,
      "epoch": 0.056428571428571425,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.201833724975586,
      "learning_rate": 9.215544350155422e-07,
      "loss": 0.0,
      "num_tokens": 7335362.0,
      "reward": 0.4437500238418579,
      "reward_std": 0.4787135720252991,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 79,
      "step_time": 3.920655517023988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.010249783445033245,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.3185906410217285,
      "learning_rate": 9.189913691825699e-07,
      "loss": 0.0,
      "num_tokens": 7403978.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.5037065148353577,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 80,
      "step_time": 3.4547118460177444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009724527830258012,
      "epoch": 0.05785714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.341253280639648,
      "learning_rate": 9.163907865818806e-07,
      "loss": -0.0,
      "num_tokens": 7468082.0,
      "reward": 0.6937500238418579,
      "reward_std": 0.49501484632492065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 81,
      "step_time": 2.828022911970038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.021473060944117606,
      "epoch": 0.05857142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.137529200713809e-07,
      "loss": 0.0,
      "num_tokens": 7558546.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 82,
      "step_time": 5.028618018957786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0026043455000035465,
      "epoch": 0.05928571428571429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.11078005847405e-07,
      "loss": 0.0,
      "num_tokens": 7625834.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 83,
      "step_time": 3.222042798937764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.022887271596118808,
      "epoch": 0.06,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.373490333557129,
      "learning_rate": 9.083662834235629e-07,
      "loss": 0.0,
      "num_tokens": 7684242.0,
      "reward": 0.5218750238418579,
      "reward_std": 0.49776285886764526,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 84,
      "step_time": 2.5565447990666144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01285930466838181,
      "epoch": 0.060714285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.25327205657959,
      "learning_rate": 9.056179956092961e-07,
      "loss": -0.0,
      "num_tokens": 7786874.0,
      "reward": 0.4906250238418579,
      "reward_std": 0.4917473793029785,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 85,
      "step_time": 7.1175413559540175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 36.0,
      "completions/max_terminated_length": 36.0,
      "completions/mean_length": 13.453125,
      "completions/mean_terminated_length": 13.453125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.04580933507531881,
      "epoch": 0.06142857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 29.327259063720703,
      "learning_rate": 9.028333884881356e-07,
      "loss": -0.0,
      "num_tokens": 7870471.0,
      "reward": 0.7984374761581421,
      "reward_std": 0.46819213032722473,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 86,
      "step_time": 4.2775687840767205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0068673123605549335,
      "epoch": 0.062142857142857146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.000127113956672e-07,
      "loss": 0.0,
      "num_tokens": 7966495.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 87,
      "step_time": 4.236654673120938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.006318100189673714,
      "epoch": 0.06285714285714286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.971562168972064e-07,
      "loss": 0.0,
      "num_tokens": 8055943.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 88,
      "step_time": 7.718285386974458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0103716982412152,
      "epoch": 0.06357142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.37955665588379,
      "learning_rate": 8.942641607651828e-07,
      "loss": -0.0,
      "num_tokens": 8159495.0,
      "reward": 0.6625000238418579,
      "reward_std": 0.5,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 89,
      "step_time": 4.891622756083962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019456376787275076,
      "epoch": 0.06428571428571428,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 39.22451400756836,
      "learning_rate": 8.91336801956239e-07,
      "loss": 0.0,
      "num_tokens": 8238415.0,
      "reward": 0.6625000238418579,
      "reward_std": 0.5,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 90,
      "step_time": 3.36728476092685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02106736705172807,
      "epoch": 0.065,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 42.52458953857422,
      "learning_rate": 8.883744025880427e-07,
      "loss": 0.0,
      "num_tokens": 8310247.0,
      "reward": 0.8343750238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.734375,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 91,
      "step_time": 2.880418001965154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.00979969812760828,
      "epoch": 0.06571428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.711299896240234,
      "learning_rate": 8.853772279158165e-07,
      "loss": 0.0,
      "num_tokens": 8387591.0,
      "reward": 0.9125000238418579,
      "reward_std": 0.39339789748191833,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.8125,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 92,
      "step_time": 5.434788639016915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.002452172411722131,
      "epoch": 0.06642857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.823455463085873e-07,
      "loss": 0.0,
      "num_tokens": 8481191.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 93,
      "step_time": 4.279522054013796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.024404613999649882,
      "epoch": 0.06714285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 49.90077209472656,
      "learning_rate": 8.792796292251559e-07,
      "loss": -0.0,
      "num_tokens": 8552663.0,
      "reward": 0.5218750238418579,
      "reward_std": 0.49776285886764526,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 94,
      "step_time": 2.9446488179964945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 15.171875,
      "completions/mean_terminated_length": 15.171875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.015569199862511596,
      "epoch": 0.06785714285714285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.761797511897906e-07,
      "loss": 0.0,
      "num_tokens": 8643242.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 95,
      "step_time": 12.633399382932112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01959893899038434,
      "epoch": 0.06857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 35.42424011230469,
      "learning_rate": 8.730461897676463e-07,
      "loss": 0.0,
      "num_tokens": 8715210.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.502967357635498,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 96,
      "step_time": 5.476754333998542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 15.5,
      "completions/mean_terminated_length": 15.5,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019250252342317253,
      "epoch": 0.06928571428571428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.698792255399103e-07,
      "loss": 0.0,
      "num_tokens": 8802986.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 97,
      "step_time": 10.231531332945451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.014249574625864625,
      "epoch": 0.07,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 26.534008026123047,
      "learning_rate": 8.666791420786803e-07,
      "loss": -0.0,
      "num_tokens": 8887802.0,
      "reward": 0.4125000238418579,
      "reward_std": 0.4671765863895416,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 98,
      "step_time": 4.172440590918995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007108512945706025,
      "epoch": 0.07071428571428572,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.561801910400391,
      "learning_rate": 8.634462259215718e-07,
      "loss": 0.0,
      "num_tokens": 8959266.0,
      "reward": 0.8343750238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.734375,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 99,
      "step_time": 3.1397473260294646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.012571636820212007,
      "epoch": 0.07142857142857142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.601807665460619e-07,
      "loss": 0.0,
      "num_tokens": 9056234.0,
      "reward": 0.4750000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 100,
      "step_time": 7.120428283000365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.011042158876080066,
      "epoch": 0.07214285714285715,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.18372917175293,
      "learning_rate": 8.568830563435694e-07,
      "loss": 0.0,
      "num_tokens": 9131162.0,
      "reward": 0.9281250238418579,
      "reward_std": 0.38025420904159546,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.828125,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 101,
      "step_time": 3.5142359259189107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.018726955458987504,
      "epoch": 0.07285714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 22.103269577026367,
      "learning_rate": 8.535533905932737e-07,
      "loss": 0.0,
      "num_tokens": 9213466.0,
      "reward": 0.5375000238418579,
      "reward_std": 0.5,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 102,
      "step_time": 6.039401607995387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.002115752373356372,
      "epoch": 0.07357142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.501920674356754e-07,
      "loss": 0.0,
      "num_tokens": 9300234.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 103,
      "step_time": 6.536880514177028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.023664406850002706,
      "epoch": 0.07428571428571429,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.414724349975586,
      "learning_rate": 8.467993878459003e-07,
      "loss": 0.0,
      "num_tokens": 9408458.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 104,
      "step_time": 5.406510353961494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.011180884641362354,
      "epoch": 0.075,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.74150276184082,
      "learning_rate": 8.433756556067505e-07,
      "loss": 0.0,
      "num_tokens": 9522706.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.4787135720252991,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 105,
      "step_time": 6.74972949095536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.022577963944058865,
      "epoch": 0.07571428571428572,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 66.94438934326172,
      "learning_rate": 8.399211772815029e-07,
      "loss": -0.0,
      "num_tokens": 9605874.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.49501484632492065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 106,
      "step_time": 5.258289148041513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.025772404042072594,
      "epoch": 0.07642857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 16.906095504760742,
      "learning_rate": 8.364362621864594e-07,
      "loss": -0.0,
      "num_tokens": 9674906.0,
      "reward": 0.7875000238418579,
      "reward_std": 0.4671765863895416,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.6875,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 107,
      "step_time": 3.7881211650092155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009171675716061145,
      "epoch": 0.07714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.32921222363251e-07,
      "loss": 0.0,
      "num_tokens": 9753194.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 108,
      "step_time": 3.6917542329756543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007731896243058145,
      "epoch": 0.07785714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 21.57496452331543,
      "learning_rate": 8.293763725508969e-07,
      "loss": -0.0,
      "num_tokens": 9832178.0,
      "reward": 0.3656250238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 109,
      "step_time": 3.5990826380439103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.00748697979724966,
      "epoch": 0.07857142857142857,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.718206405639648,
      "learning_rate": 8.258020301576223e-07,
      "loss": 0.0,
      "num_tokens": 9929978.0,
      "reward": 0.6468750238418579,
      "reward_std": 0.501733124256134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 110,
      "step_time": 5.109096220054198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007316715433262289,
      "epoch": 0.07928571428571428,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.581790924072266,
      "learning_rate": 8.221985152324384e-07,
      "loss": -0.0,
      "num_tokens": 9994906.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 111,
      "step_time": 3.6931803559418768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0037167306727496907,
      "epoch": 0.08,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.185661504364844e-07,
      "loss": 0.0,
      "num_tokens": 10068282.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 112,
      "step_time": 3.9287309639621526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007747493364149705,
      "epoch": 0.08071428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.363574504852295,
      "learning_rate": 8.149052610141355e-07,
      "loss": 0.0,
      "num_tokens": 10147770.0,
      "reward": 0.8656250238418579,
      "reward_std": 0.42695629596710205,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.765625,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 113,
      "step_time": 4.5494631649926305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.010833295003976673,
      "epoch": 0.08142857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.112161747638821e-07,
      "loss": 0.0,
      "num_tokens": 10226570.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 114,
      "step_time": 4.508432603033725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007666514196898788,
      "epoch": 0.08214285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.074992220089768e-07,
      "loss": 0.0,
      "num_tokens": 10295866.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 115,
      "step_time": 3.0693239220418036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.022078550304286182,
      "epoch": 0.08285714285714285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.685462951660156,
      "learning_rate": 8.037547355678576e-07,
      "loss": 0.0,
      "num_tokens": 10365162.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 116,
      "step_time": 3.2104193790000863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007084596261847764,
      "epoch": 0.08357142857142857,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.003612518310547,
      "learning_rate": 7.999830507243477e-07,
      "loss": -0.0,
      "num_tokens": 10440450.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 117,
      "step_time": 4.254953389870934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.018427874660119414,
      "epoch": 0.08428571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 39.15703582763672,
      "learning_rate": 7.961845051976332e-07,
      "loss": 0.0,
      "num_tokens": 10527178.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 118,
      "step_time": 6.791951371007599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1207.0,
      "completions/max_terminated_length": 1207.0,
      "completions/mean_length": 31.65625,
      "completions/mean_terminated_length": 31.65625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01668112922925502,
      "epoch": 0.085,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.928234100341797,
      "learning_rate": 7.923594391120236e-07,
      "loss": 0.0,
      "num_tokens": 10618972.0,
      "reward": 0.6937500238418579,
      "reward_std": 0.49501484632492065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 119,
      "step_time": 82.62583560397616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.010110943403560668,
      "epoch": 0.08571428571428572,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.88508194966497e-07,
      "loss": 0.0,
      "num_tokens": 10692188.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 120,
      "step_time": 4.424513722071424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009336426679510623,
      "epoch": 0.08642857142857142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.84631117604033e-07,
      "loss": 0.0,
      "num_tokens": 10782300.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 121,
      "step_time": 5.603050055215135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.021290896052960306,
      "epoch": 0.08714285714285715,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.509204864501953,
      "learning_rate": 7.80728554180734e-07,
      "loss": 0.0,
      "num_tokens": 10883556.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.5037065148353577,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 122,
      "step_time": 6.613117044093087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0029609822304337285,
      "epoch": 0.08785714285714286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.768008541347421e-07,
      "loss": 0.0,
      "num_tokens": 10960532.0,
      "reward": 1.100000023841858,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 1.0,
      "rewards/mcq_exact_match_reward/std": 0.0,
      "step": 123,
      "step_time": 3.7264017250272445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.00125310622388497,
      "epoch": 0.08857142857142856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.728483691549491e-07,
      "loss": 0.0,
      "num_tokens": 11053804.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 124,
      "step_time": 4.139133257966023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.004454300171346404,
      "epoch": 0.08928571428571429,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.340372562408447,
      "learning_rate": 7.688714531495059e-07,
      "loss": 0.0,
      "num_tokens": 11132692.0,
      "reward": 0.7093750238418579,
      "reward_std": 0.4917473793029785,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 125,
      "step_time": 5.186251167091541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019100001431070268,
      "epoch": 0.09,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.68722915649414,
      "learning_rate": 7.648704622141347e-07,
      "loss": 0.0,
      "num_tokens": 11205652.0,
      "reward": 0.6468750238418579,
      "reward_std": 0.501733124256134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 126,
      "step_time": 4.047589352878276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.014429281174670905,
      "epoch": 0.09071428571428572,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.554597854614258,
      "learning_rate": 7.608457546002422e-07,
      "loss": 0.0,
      "num_tokens": 11305180.0,
      "reward": 0.4906250238418579,
      "reward_std": 0.4917473793029785,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 127,
      "step_time": 6.020642440940719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01491769595304504,
      "epoch": 0.09142857142857143,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.746049880981445,
      "learning_rate": 7.56797690682843e-07,
      "loss": 0.0,
      "num_tokens": 11379044.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.46049273014068604,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 128,
      "step_time": 4.16096327296691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 109.0,
      "completions/max_terminated_length": 109.0,
      "completions/mean_length": 14.5,
      "completions/mean_terminated_length": 14.5,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.03012668783776462,
      "epoch": 0.09214285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.020849227905273,
      "learning_rate": 7.527266329282905e-07,
      "loss": 0.0,
      "num_tokens": 11445012.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.4787135720252991,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 129,
      "step_time": 7.198684796865564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.020660966634750366,
      "epoch": 0.09285714285714286,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.086788177490234,
      "learning_rate": 7.486329458618215e-07,
      "loss": 0.0,
      "num_tokens": 11537204.0,
      "reward": 0.6468750238418579,
      "reward_std": 0.501733124256134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 130,
      "step_time": 4.54672675288748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0055168012622743845,
      "epoch": 0.09357142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.445169960349166e-07,
      "loss": 0.0,
      "num_tokens": 11631108.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 131,
      "step_time": 4.3755096909590065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009784101857803762,
      "epoch": 0.09428571428571429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.403791519924793e-07,
      "loss": 0.0,
      "num_tokens": 11703972.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 132,
      "step_time": 3.261271098861471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.008812747604679316,
      "epoch": 0.095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.362197842398354e-07,
      "loss": 0.0,
      "num_tokens": 11767676.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 133,
      "step_time": 3.028584598971065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 44.0,
      "completions/max_terminated_length": 44.0,
      "completions/mean_length": 13.484375,
      "completions/mean_terminated_length": 13.484375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.027414096985012293,
      "epoch": 0.09571428571428571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.320392652095583e-07,
      "loss": 0.0,
      "num_tokens": 11849683.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 134,
      "step_time": 3.9485261590452865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 16.5625,
      "completions/mean_terminated_length": 16.5625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.03702096623601392,
      "epoch": 0.09642857142857143,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.03097152709961,
      "learning_rate": 7.278379692281208e-07,
      "loss": 0.0,
      "num_tokens": 11930367.0,
      "reward": 0.6781250238418579,
      "reward_std": 0.49776285886764526,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 135,
      "step_time": 16.50799851596821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 474.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 20.203125,
      "completions/mean_terminated_length": 20.203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.016940920031629503,
      "epoch": 0.09714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.236162724823778e-07,
      "loss": 0.0,
      "num_tokens": 11996020.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 136,
      "step_time": 20.552247615996748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 26.609375,
      "completions/mean_terminated_length": 26.609375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.07297810423187912,
      "epoch": 0.09785714285714285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.540674209594727,
      "learning_rate": 7.193745529858826e-07,
      "loss": 0.0,
      "num_tokens": 12075187.0,
      "reward": 0.550000011920929,
      "reward_std": 0.5048966407775879,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 137,
      "step_time": 24.535814730101265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.016214959672652185,
      "epoch": 0.09857142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.151131905450385e-07,
      "loss": 0.0,
      "num_tokens": 12165891.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 138,
      "step_time": 4.597669165057596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 90.0,
      "completions/max_terminated_length": 90.0,
      "completions/mean_length": 14.203125,
      "completions/mean_terminated_length": 14.203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.04903080174699426,
      "epoch": 0.09928571428571428,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 34.56901931762695,
      "learning_rate": 7.10832566725092e-07,
      "loss": 0.0,
      "num_tokens": 12231904.0,
      "reward": 0.6468750238418579,
      "reward_std": 0.501733124256134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 139,
      "step_time": 6.9713660419220105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0055509630183223635,
      "epoch": 0.1,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.065330648159655e-07,
      "loss": 0.0,
      "num_tokens": 12303128.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 140,
      "step_time": 3.502849594980944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01160202972823754,
      "epoch": 0.10071428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 21.754417419433594,
      "learning_rate": 7.022150697979384e-07,
      "loss": -0.0,
      "num_tokens": 12391456.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 141,
      "step_time": 4.62359821901191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 24.484375,
      "completions/mean_terminated_length": 24.484375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0657657328993082,
      "epoch": 0.10142857142857142,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 40.662166595458984,
      "learning_rate": 6.978789683071759e-07,
      "loss": -0.0,
      "num_tokens": 12475671.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 142,
      "step_time": 17.122637529973872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 751.0,
      "completions/mean_length": 56.3125,
      "completions/mean_terminated_length": 24.698413848876953,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.02439082640921697,
      "epoch": 0.10214285714285715,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.452528953552246,
      "learning_rate": 6.935251486011086e-07,
      "loss": 0.0,
      "num_tokens": 12552059.0,
      "reward": 0.5984375476837158,
      "reward_std": 0.5056795477867126,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 143,
      "step_time": 121.78867189295124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 17.328125,
      "completions/mean_terminated_length": 17.328125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.042446967447176576,
      "epoch": 0.10285714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 21.53173828125,
      "learning_rate": 6.891540005236674e-07,
      "loss": -0.0,
      "num_tokens": 12635208.0,
      "reward": 0.6617187857627869,
      "reward_std": 0.5009310245513916,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 144,
      "step_time": 14.861859488883056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009935209178365767,
      "epoch": 0.10357142857142858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.847659154703785e-07,
      "loss": 0.0,
      "num_tokens": 12719712.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 145,
      "step_time": 5.540668106987141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 616.0,
      "completions/max_terminated_length": 616.0,
      "completions/mean_length": 84.890625,
      "completions/mean_terminated_length": 84.890625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.11740031838417053,
      "epoch": 0.10428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 8.078269004821777,
      "learning_rate": 6.803612863533149e-07,
      "loss": 0.0,
      "num_tokens": 12805409.0,
      "reward": 0.8656250238418579,
      "reward_std": 0.42695629596710205,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.765625,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 146,
      "step_time": 31.698010974912904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 792.0,
      "completions/mean_length": 61.859375,
      "completions/mean_terminated_length": 30.333335876464844,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.045849967980757356,
      "epoch": 0.105,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.473923206329346,
      "learning_rate": 6.759405075659165e-07,
      "loss": 0.0,
      "num_tokens": 12893416.0,
      "reward": 0.846875011920929,
      "reward_std": 0.44220542907714844,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 147,
      "step_time": 150.5390612690244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 687.0,
      "completions/max_terminated_length": 687.0,
      "completions/mean_length": 46.71875,
      "completions/mean_terminated_length": 46.71875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.16094551188871264,
      "epoch": 0.10571428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 6.536345958709717,
      "learning_rate": 6.715039749476763e-07,
      "loss": 0.0,
      "num_tokens": 12942342.0,
      "reward": 0.6453125476837158,
      "reward_std": 0.5036153793334961,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 148,
      "step_time": 26.220934735029005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 15.984375,
      "completions/mean_terminated_length": 15.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.03024544403888285,
      "epoch": 0.10642857142857143,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.6902215480804443,
      "learning_rate": 6.670520857486949e-07,
      "loss": -0.0,
      "num_tokens": 13035949.0,
      "reward": 0.5992187857627869,
      "reward_std": 0.5032033920288086,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 149,
      "step_time": 11.029159978032112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 728.0,
      "completions/max_terminated_length": 728.0,
      "completions/mean_length": 33.375,
      "completions/mean_terminated_length": 33.375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.06737930839881301,
      "epoch": 0.10714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.0915071964263916,
      "learning_rate": 6.625852385941118e-07,
      "loss": 0.0,
      "num_tokens": 13118629.0,
      "reward": 0.8804687857627869,
      "reward_std": 0.41629672050476074,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.78125,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 150,
      "step_time": 37.43122749996837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 23.0625,
      "completions/mean_terminated_length": 23.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.10096252337098122,
      "epoch": 0.10785714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.689016342163086,
      "learning_rate": 6.58103833448412e-07,
      "loss": 0.0,
      "num_tokens": 13209761.0,
      "reward": 0.8343750238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.734375,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 151,
      "step_time": 7.120647723902948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 538.0,
      "completions/max_terminated_length": 538.0,
      "completions/mean_length": 45.453125,
      "completions/mean_terminated_length": 45.453125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09933191817253828,
      "epoch": 0.10857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 13.227392196655273,
      "learning_rate": 6.536082715796124e-07,
      "loss": 0.0,
      "num_tokens": 13282486.0,
      "reward": 0.7546875476837158,
      "reward_std": 0.48104703426361084,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 152,
      "step_time": 24.34091979288496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 781.0,
      "completions/mean_length": 102.734375,
      "completions/mean_terminated_length": 71.85714721679688,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.10967912850901484,
      "epoch": 0.10928571428571429,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.153820037841797,
      "learning_rate": 6.490989555233327e-07,
      "loss": -0.0,
      "num_tokens": 13371077.0,
      "reward": 0.753125011920929,
      "reward_std": 0.48336413502693176,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 153,
      "step_time": 129.29081673000474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.038476365618407726,
      "epoch": 0.11,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.445762890467517e-07,
      "loss": 0.0,
      "num_tokens": 13481541.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 154,
      "step_time": 7.0913085790816694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 57.0,
      "completions/max_terminated_length": 57.0,
      "completions/mean_length": 13.625,
      "completions/mean_terminated_length": 13.625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.03220844024326652,
      "epoch": 0.11071428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.658990383148193,
      "learning_rate": 6.400406771124535e-07,
      "loss": 0.0,
      "num_tokens": 13553333.0,
      "reward": 0.8343750238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.734375,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 155,
      "step_time": 5.22538055095356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.0,
      "completions/max_terminated_length": 121.0,
      "completions/mean_length": 14.875,
      "completions/mean_terminated_length": 14.875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.04669280140660703,
      "epoch": 0.11142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.39362335205078,
      "learning_rate": 6.354925258421675e-07,
      "loss": 0.0,
      "num_tokens": 13637053.0,
      "reward": 0.8796875476837158,
      "reward_std": 0.41602033376693726,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.78125,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 156,
      "step_time": 8.888459036010318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.022164398804306984,
      "epoch": 0.11214285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.90540313720703,
      "learning_rate": 6.309322424804033e-07,
      "loss": 0.0,
      "num_tokens": 13729052.0,
      "reward": 0.5218750238418579,
      "reward_std": 0.49776285886764526,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 157,
      "step_time": 3.9538759248680435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.028873756295070052,
      "epoch": 0.11285714285714285,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.955380916595459,
      "learning_rate": 6.263602353579866e-07,
      "loss": -0.0,
      "num_tokens": 13799450.0,
      "reward": 0.9593750238418579,
      "reward_std": 0.3503824472427368,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.859375,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 158,
      "step_time": 4.26783420908032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.00713545671897009,
      "epoch": 0.11357142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.217769138554959e-07,
      "loss": 0.0,
      "num_tokens": 13871953.0,
      "reward": 1.100000023841858,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 1.0,
      "rewards/mcq_exact_match_reward/std": 0.0,
      "step": 159,
      "step_time": 2.712456647946965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.013005010900087655,
      "epoch": 0.11428571428571428,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.743955612182617,
      "learning_rate": 6.171826883666074e-07,
      "loss": 0.0,
      "num_tokens": 13973689.0,
      "reward": 0.5843750238418579,
      "reward_std": 0.5037065148353577,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 160,
      "step_time": 6.412190584058408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 13.015625,
      "completions/mean_terminated_length": 13.015625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02211562287993729,
      "epoch": 0.115,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.129695892333984,
      "learning_rate": 6.12577970261347e-07,
      "loss": -0.0,
      "num_tokens": 14055802.0,
      "reward": 0.7242187857627869,
      "reward_std": 0.487379789352417,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 161,
      "step_time": 4.479429450002499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019997738883830607,
      "epoch": 0.11571428571428571,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.165828704833984,
      "learning_rate": 6.079631718492568e-07,
      "loss": -0.0,
      "num_tokens": 14166330.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.5037065148353577,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 162,
      "step_time": 7.445554191886913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.027288892189972103,
      "epoch": 0.11642857142857142,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.685976028442383,
      "learning_rate": 6.033387063424764e-07,
      "loss": 0.0,
      "num_tokens": 14258536.0,
      "reward": 0.7875000238418579,
      "reward_std": 0.4671765863895416,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.6875,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 163,
      "step_time": 7.0269312839373015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 407.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 19.15625,
      "completions/mean_terminated_length": 19.15625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.03532049781642854,
      "epoch": 0.11714285714285715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.565665245056152,
      "learning_rate": 5.987049878187436e-07,
      "loss": -0.0,
      "num_tokens": 14344858.0,
      "reward": 0.7710937857627869,
      "reward_std": 0.47440895438194275,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 164,
      "step_time": 21.54071432899218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.016357870190404356,
      "epoch": 0.11785714285714285,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 16.555530548095703,
      "learning_rate": 5.940624311843168e-07,
      "loss": 0.0,
      "num_tokens": 14429170.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.46049273014068604,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 165,
      "step_time": 4.7839336470351554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.026364905992522836,
      "epoch": 0.11857142857142858,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.869507789611816,
      "learning_rate": 5.894114521368258e-07,
      "loss": -0.0,
      "num_tokens": 14490984.0,
      "reward": 0.9437500238418579,
      "reward_std": 0.36596253514289856,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.84375,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 166,
      "step_time": 3.4966191770508885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 17.46875,
      "completions/mean_terminated_length": 17.46875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.013556473655626178,
      "epoch": 0.11928571428571429,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.16694974899292,
      "learning_rate": 5.847524671280483e-07,
      "loss": 0.0,
      "num_tokens": 14553334.0,
      "reward": 0.6148437857627869,
      "reward_std": 0.5029815435409546,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 167,
      "step_time": 12.092128862044774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.013614867959404364,
      "epoch": 0.12,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.871785163879395,
      "learning_rate": 5.800858933266212e-07,
      "loss": 0.0,
      "num_tokens": 14642670.0,
      "reward": 0.9437500238418579,
      "reward_std": 0.36596253514289856,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.84375,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 168,
      "step_time": 4.104719978873618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.01443118933821097,
      "epoch": 0.12071428571428572,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.75412148580687e-07,
      "loss": 0.0,
      "num_tokens": 14710006.0,
      "reward": 0.3500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 169,
      "step_time": 3.3311328379204497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019871412543579936,
      "epoch": 0.12142857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 24.622060775756836,
      "learning_rate": 5.707316513804792e-07,
      "loss": 0.0,
      "num_tokens": 14791358.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 170,
      "step_time": 6.276687276025768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.007149734243284911,
      "epoch": 0.12214285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.660448208208513e-07,
      "loss": 0.0,
      "num_tokens": 14861150.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 171,
      "step_time": 3.8035881727701053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 15.703125,
      "completions/mean_terminated_length": 15.703125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.015912733739241958,
      "epoch": 0.12285714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 19.102737426757812,
      "learning_rate": 5.613520765637489e-07,
      "loss": -0.0,
      "num_tokens": 14942923.0,
      "reward": 0.9437500238418579,
      "reward_std": 0.36596253514289856,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.84375,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 172,
      "step_time": 8.8402068670257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.953125,
      "completions/mean_terminated_length": 12.953125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.021281153662130237,
      "epoch": 0.12357142857142857,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.939657688140869,
      "learning_rate": 5.56653838800635e-07,
      "loss": -0.0,
      "num_tokens": 15030208.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 173,
      "step_time": 4.201213694992475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 440.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 25.578125,
      "completions/mean_terminated_length": 25.578125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.045478251413442194,
      "epoch": 0.12428571428571429,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.5842310190200806,
      "learning_rate": 5.519505282148643e-07,
      "loss": -0.0,
      "num_tokens": 15113453.0,
      "reward": 1.0499999523162842,
      "reward_std": 0.22730302810668945,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.1510545015335083,
      "rewards/mcq_exact_match_reward/mean": 0.953125,
      "rewards/mcq_exact_match_reward/std": 0.21304203569889069,
      "step": 174,
      "step_time": 28.228189032059163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.01783720776438713,
      "epoch": 0.125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 21.070640563964844,
      "learning_rate": 5.472425659440156e-07,
      "loss": -0.0,
      "num_tokens": 15178379.0,
      "reward": 0.9906250238418579,
      "reward_std": 0.3145764470100403,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.890625,
      "rewards/mcq_exact_match_reward/std": 0.3145764470100403,
      "step": 175,
      "step_time": 3.1726534390472807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02504683134611696,
      "epoch": 0.12571428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 20.849273681640625,
      "learning_rate": 5.425303735421828e-07,
      "loss": 0.0,
      "num_tokens": 15256923.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 176,
      "step_time": 5.022981332032941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.01540215959539637,
      "epoch": 0.12642857142857142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.378143729422284e-07,
      "loss": 0.0,
      "num_tokens": 15365457.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 177,
      "step_time": 5.918834337033331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.008651553944218904,
      "epoch": 0.12714285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.7155351638793945,
      "learning_rate": 5.330949864180033e-07,
      "loss": -0.0,
      "num_tokens": 15443911.0,
      "reward": 1.0992188453674316,
      "reward_std": 0.006250003352761269,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 1.0,
      "rewards/mcq_exact_match_reward/std": 0.0,
      "step": 178,
      "step_time": 4.0626735190162435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 17.5625,
      "completions/mean_terminated_length": 17.5625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.0304659788380377,
      "epoch": 0.12785714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 9.790291786193848,
      "learning_rate": 5.28372636546537e-07,
      "loss": 0.0,
      "num_tokens": 15519371.0,
      "reward": 0.23906251788139343,
      "reward_std": 0.35124140977859497,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 179,
      "step_time": 11.25525397103047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.01876044215168804,
      "epoch": 0.12857142857142856,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.506415367126465,
      "learning_rate": 5.236477461701985e-07,
      "loss": 0.0,
      "num_tokens": 15607905.0,
      "reward": 0.8343750238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.734375,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 180,
      "step_time": 4.205256605986506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.90625,
      "completions/mean_terminated_length": 12.90625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.029853320098482072,
      "epoch": 0.12928571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.465096473693848,
      "learning_rate": 5.189207383588352e-07,
      "loss": 0.0,
      "num_tokens": 15706555.0,
      "reward": 0.6781250238418579,
      "reward_std": 0.49776285886764526,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 181,
      "step_time": 7.094486114161555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.90625,
      "completions/mean_terminated_length": 12.90625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.010658063692972064,
      "epoch": 0.13,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.141920363718916e-07,
      "loss": 0.0,
      "num_tokens": 15784853.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.3333333432674408,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 182,
      "step_time": 3.412773276970256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0029293073748704046,
      "epoch": 0.13071428571428573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.094620636205095e-07,
      "loss": 0.0,
      "num_tokens": 15856085.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 183,
      "step_time": 2.87448824493913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.015074294526129961,
      "epoch": 0.13142857142857142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.047312436296158e-07,
      "loss": 0.0,
      "num_tokens": 15911093.0,
      "reward": 0.4750000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 184,
      "step_time": 2.8168845549225807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.009781015396583825,
      "epoch": 0.13214285714285715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 15982949.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 185,
      "step_time": 3.729889392852783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 13.015625,
      "completions/mean_terminated_length": 13.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.023459993302822113,
      "epoch": 0.13285714285714287,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.2294769287109375,
      "learning_rate": 4.952687563703841e-07,
      "loss": 0.0,
      "num_tokens": 16079918.0,
      "reward": 0.5984375476837158,
      "reward_std": 0.5056795477867126,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 186,
      "step_time": 5.194811234949157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 13.015625,
      "completions/mean_terminated_length": 13.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.005734891252359375,
      "epoch": 0.13357142857142856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.381458282470703,
      "learning_rate": 4.905379363794906e-07,
      "loss": -0.0,
      "num_tokens": 16171215.0,
      "reward": 0.6765625476837158,
      "reward_std": 0.49975937604904175,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 187,
      "step_time": 4.580101553059649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.96875,
      "completions/mean_terminated_length": 12.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.0026500039821257815,
      "epoch": 0.13428571428571429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.858079636281084e-07,
      "loss": 0.0,
      "num_tokens": 16262173.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 188,
      "step_time": 5.5456535129924305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.0,
      "completions/max_terminated_length": 18.0,
      "completions/mean_length": 13.09375,
      "completions/mean_terminated_length": 13.09375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.011145705298986286,
      "epoch": 0.135,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.990577697753906,
      "learning_rate": 4.810792616411649e-07,
      "loss": 0.0,
      "num_tokens": 16320587.0,
      "reward": 0.971875011920929,
      "reward_std": 0.34201493859291077,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 189,
      "step_time": 2.8638349280226976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.02099756433744915,
      "epoch": 0.1357142857142857,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.941129684448242,
      "learning_rate": 4.763522538298017e-07,
      "loss": -0.0,
      "num_tokens": 16402163.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 190,
      "step_time": 4.850232388009317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 13.03125,
      "completions/mean_terminated_length": 13.03125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.00562438897031825,
      "epoch": 0.13642857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.7162736345346296e-07,
      "loss": 0.0,
      "num_tokens": 16483949.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 191,
      "step_time": 3.6737312379991636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.010754016373539343,
      "epoch": 0.13714285714285715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.6690501358199655e-07,
      "loss": 0.0,
      "num_tokens": 16578652.0,
      "reward": 0.3500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 192,
      "step_time": 7.894912258896511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.011054494883865118,
      "epoch": 0.13785714285714284,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 21.005765914916992,
      "learning_rate": 4.621856270577718e-07,
      "loss": -0.0,
      "num_tokens": 16646828.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.4836103618144989,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 193,
      "step_time": 2.648993079084903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.00612640570034273,
      "epoch": 0.13857142857142857,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.35145378112793,
      "learning_rate": 4.5746962645781723e-07,
      "loss": 0.0,
      "num_tokens": 16719476.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.46049273014068604,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 194,
      "step_time": 3.3733782949857414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.01801892218645662,
      "epoch": 0.1392857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.5275743405598437e-07,
      "loss": 0.0,
      "num_tokens": 16806395.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 195,
      "step_time": 7.701647555048112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.020205819397233427,
      "epoch": 0.14,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 32.03440475463867,
      "learning_rate": 4.480494717851358e-07,
      "loss": 0.0,
      "num_tokens": 16883851.0,
      "reward": 0.6468750238418579,
      "reward_std": 0.501733124256134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 196,
      "step_time": 3.548996380006429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.019142711884342134,
      "epoch": 0.1407142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.11782455444336,
      "learning_rate": 4.433461611993651e-07,
      "loss": -0.0,
      "num_tokens": 16950395.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.4531635046005249,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 197,
      "step_time": 3.6711106749717146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.0016078357584774494,
      "epoch": 0.14142857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.3864792343625115e-07,
      "loss": 0.0,
      "num_tokens": 17058795.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5039526224136353,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 198,
      "step_time": 7.064584863022901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.018027375219389796,
      "epoch": 0.14214285714285715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 44.21977996826172,
      "learning_rate": 4.3395517917914894e-07,
      "loss": 0.0,
      "num_tokens": 17140074.0,
      "reward": 0.6625000238418579,
      "reward_std": 0.5,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 199,
      "step_time": 4.233198134053964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.984375,
      "completions/mean_terminated_length": 12.984375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.02401040424592793,
      "epoch": 0.14285714285714285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.2926834861952077e-07,
      "loss": 0.0,
      "num_tokens": 17224545.0,
      "reward": 0.3500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 200,
      "step_time": 4.817044052877463
    }
  ],
  "logging_steps": 1,
  "max_steps": 350,
  "num_input_tokens_seen": 17224545,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}