golden-goose-qwen2.5-1.5b-i…/checkpoint-50/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.03571428571428571,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1221.0,
      "completions/max_terminated_length": 1221.0,
      "completions/mean_length": 429.265625,
      "completions/mean_terminated_length": 429.265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.16631191410124302,
      "epoch": 0.0007142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.064172744750977,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 106993.0,
      "reward": 0.3062499761581421,
      "reward_std": 0.44636982679367065,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.279951810836792,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 1,
      "step_time": 58.5344306009938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1353.0,
      "completions/max_terminated_length": 1353.0,
      "completions/mean_length": 318.9375,
      "completions/mean_terminated_length": 318.9375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.21383497677743435,
      "epoch": 0.0014285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.98560905456543,
      "learning_rate": 5.555555555555555e-08,
      "loss": -0.0,
      "num_tokens": 215429.0,
      "reward": 0.1718749850988388,
      "reward_std": 0.3545480966567993,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.2745848298072815,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 2,
      "step_time": 88.76631601905683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1401.0,
      "completions/mean_length": 473.1875,
      "completions/mean_terminated_length": 448.19049072265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18973157368600368,
      "epoch": 0.002142857142857143,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 4.975986957550049,
      "learning_rate": 1.111111111111111e-07,
      "loss": 0.0,
      "num_tokens": 328561.0,
      "reward": 0.26093751192092896,
      "reward_std": 0.421775221824646,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.2847827076911926,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 3,
      "step_time": 165.18509875505697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1586.0,
      "completions/mean_length": 572.875,
      "completions/mean_terminated_length": 525.290283203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.16460688412189484,
      "epoch": 0.002857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.808328628540039,
      "learning_rate": 1.6666666666666665e-07,
      "loss": -0.0,
      "num_tokens": 457417.0,
      "reward": 0.3304687440395355,
      "reward_std": 0.4703609347343445,
      "rewards/format_reward/mean": 0.3359375,
      "rewards/format_reward/std": 0.29620200395584106,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 4,
      "step_time": 144.26409805112053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1520.0,
      "completions/max_terminated_length": 1520.0,
      "completions/mean_length": 487.84375,
      "completions/mean_terminated_length": 487.84375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.1548436339944601,
      "epoch": 0.0035714285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.682651996612549,
      "learning_rate": 2.222222222222222e-07,
      "loss": 0.0,
      "num_tokens": 581367.0,
      "reward": 0.26640623807907104,
      "reward_std": 0.43771177530288696,
      "rewards/format_reward/mean": 0.3203125,
      "rewards/format_reward/std": 0.27265870571136475,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 5,
      "step_time": 108.35025227611186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1734.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 504.1875,
      "completions/mean_terminated_length": 504.1875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.15498177334666252,
      "epoch": 0.004285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 19.60171890258789,
      "learning_rate": 2.7777777777777776e-07,
      "loss": 0.0,
      "num_tokens": 697603.0,
      "reward": 0.15312498807907104,
      "reward_std": 0.3424786627292633,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 6,
      "step_time": 101.33141891699051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1490.0,
      "completions/max_terminated_length": 1490.0,
      "completions/mean_length": 490.046875,
      "completions/mean_terminated_length": 490.046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.16023985855281353,
      "epoch": 0.005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.83193302154541,
      "learning_rate": 3.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 815102.0,
      "reward": 0.21953123807907104,
      "reward_std": 0.4048923850059509,
      "rewards/format_reward/mean": 0.3203125,
      "rewards/format_reward/std": 0.27265870571136475,
      "rewards/mcq_exact_match_reward/mean": 0.1875,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 7,
      "step_time": 91.50709407392424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 898.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 304.890625,
      "completions/mean_terminated_length": 304.890625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "entropy": 0.24000362865626812,
      "epoch": 0.005714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.5482072830200195,
      "learning_rate": 3.888888888888889e-07,
      "loss": -0.0,
      "num_tokens": 898271.0,
      "reward": 0.37343746423721313,
      "reward_std": 0.4891658127307892,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.2630521357059479,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 8,
      "step_time": 39.65540377004072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1663.0,
      "completions/mean_length": 562.890625,
      "completions/mean_terminated_length": 539.3175048828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.16113737598061562,
      "epoch": 0.0064285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.5696187019348145,
      "learning_rate": 4.444444444444444e-07,
      "loss": -0.0,
      "num_tokens": 1037680.0,
      "reward": 0.3976562023162842,
      "reward_std": 0.48798495531082153,
      "rewards/format_reward/mean": 0.3828125,
      "rewards/format_reward/std": 0.21347814798355103,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 9,
      "step_time": 141.6887187999091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1540.0,
      "completions/mean_length": 482.828125,
      "completions/mean_terminated_length": 457.9841613769531,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.13189083151519299,
      "epoch": 0.007142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.855482578277588,
      "learning_rate": 5e-07,
      "loss": -0.0,
      "num_tokens": 1177501.0,
      "reward": 0.2890625,
      "reward_std": 0.44351306557655334,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.2592533528804779,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 10,
      "step_time": 166.28699472307926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1330.0,
      "completions/max_terminated_length": 1330.0,
      "completions/mean_length": 475.71875,
      "completions/mean_terminated_length": 475.71875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.1734474077820778,
      "epoch": 0.007857142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.398857593536377,
      "learning_rate": 5.555555555555555e-07,
      "loss": 0.0,
      "num_tokens": 1285235.0,
      "reward": 0.3296874761581421,
      "reward_std": 0.46408456563949585,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.29839184880256653,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 11,
      "step_time": 84.22253790113609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1220.0,
      "completions/mean_length": 472.0,
      "completions/mean_terminated_length": 421.1612854003906,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.1755824889987707,
      "epoch": 0.008571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.043524265289307,
      "learning_rate": 6.111111111111112e-07,
      "loss": -0.0,
      "num_tokens": 1402707.0,
      "reward": 0.3609374761581421,
      "reward_std": 0.4861958622932434,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.31140682101249695,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 12,
      "step_time": 147.3180411880021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1790.0,
      "completions/max_terminated_length": 1790.0,
      "completions/mean_length": 390.375,
      "completions/mean_terminated_length": 390.375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "entropy": 0.18040168471634388,
      "epoch": 0.009285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.297653198242188,
      "learning_rate": 6.666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1499635.0,
      "reward": 0.34296876192092896,
      "reward_std": 0.4763341546058655,
      "rewards/format_reward/mean": 0.4609375,
      "rewards/format_reward/std": 0.37059250473976135,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 13,
      "step_time": 114.5199738269439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1307.0,
      "completions/mean_length": 537.03125,
      "completions/mean_terminated_length": 488.2903137207031,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.17947025410830975,
      "epoch": 0.01,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.441692352294922,
      "learning_rate": 7.222222222222221e-07,
      "loss": -0.0,
      "num_tokens": 1617149.0,
      "reward": 0.1929687261581421,
      "reward_std": 0.3684875965118408,
      "rewards/format_reward/mean": 0.3671875,
      "rewards/format_reward/std": 0.28510910272598267,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 14,
      "step_time": 151.2594413299812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1614.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 591.671875,
      "completions/mean_terminated_length": 591.671875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.14635740965604782,
      "epoch": 0.010714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.28996467590332,
      "learning_rate": 7.777777777777778e-07,
      "loss": -0.0,
      "num_tokens": 1737472.0,
      "reward": 0.43437498807907104,
      "reward_std": 0.49728426337242126,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.22712838649749756,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 15,
      "step_time": 89.8077824919601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1520.0,
      "completions/max_terminated_length": 1520.0,
      "completions/mean_length": 394.375,
      "completions/mean_terminated_length": 394.375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.14997188560664654,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.825170516967773,
      "learning_rate": 8.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1841536.0,
      "reward": 0.44218748807907104,
      "reward_std": 0.5008895993232727,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.24346621334552765,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 16,
      "step_time": 76.37220047204755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1665.0,
      "completions/mean_length": 405.4375,
      "completions/mean_terminated_length": 379.3651123046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.1900953222066164,
      "epoch": 0.012142857142857143,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 4.9255475997924805,
      "learning_rate": 8.888888888888888e-07,
      "loss": 0.0,
      "num_tokens": 1953596.0,
      "reward": 0.29296875,
      "reward_std": 0.444888710975647,
      "rewards/format_reward/mean": 0.4296875,
      "rewards/format_reward/std": 0.26528194546699524,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 17,
      "step_time": 201.1936074459809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1469.0,
      "completions/max_terminated_length": 1469.0,
      "completions/mean_length": 503.375,
      "completions/mean_terminated_length": 503.375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.16892211325466633,
      "epoch": 0.012857142857142857,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.7092769145965576,
      "learning_rate": 9.444444444444444e-07,
      "loss": 0.0,
      "num_tokens": 2075684.0,
      "reward": 0.39374998211860657,
      "reward_std": 0.4857901334762573,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 18,
      "step_time": 85.88162007700885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1472.0,
      "completions/max_terminated_length": 1472.0,
      "completions/mean_length": 405.21875,
      "completions/mean_terminated_length": 405.21875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.14752909820526838,
      "epoch": 0.013571428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.623274803161621,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 2192466.0,
      "reward": 0.3187499940395355,
      "reward_std": 0.45114490389823914,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.2357022762298584,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 19,
      "step_time": 92.52301802794682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1522.0,
      "completions/mean_length": 510.96875,
      "completions/mean_terminated_length": 461.3870849609375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.19505906477570534,
      "epoch": 0.014285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.067420482635498,
      "learning_rate": 9.999776148326214e-07,
      "loss": 0.0,
      "num_tokens": 2310224.0,
      "reward": 0.17656250298023224,
      "reward_std": 0.33117976784706116,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.23517554998397827,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 20,
      "step_time": 171.9173401860171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1016.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 405.171875,
      "completions/mean_terminated_length": 405.171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.20128028839826584,
      "epoch": 0.015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.80737590789795,
      "learning_rate": 9.999104613348689e-07,
      "loss": -0.0,
      "num_tokens": 2409627.0,
      "reward": 0.32343748211860657,
      "reward_std": 0.4627358913421631,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.33592742681503296,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 21,
      "step_time": 53.99907385505503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1459.0,
      "completions/mean_length": 482.375,
      "completions/mean_terminated_length": 457.5238342285156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.16458906698971987,
      "epoch": 0.015714285714285715,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 4.145486831665039,
      "learning_rate": 9.997985455197113e-07,
      "loss": -0.0,
      "num_tokens": 2518611.0,
      "reward": 0.35078126192092896,
      "reward_std": 0.46423619985580444,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.3249503970146179,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 22,
      "step_time": 158.2987147619715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 971.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 269.3125,
      "completions/mean_terminated_length": 269.3125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.21560950204730034,
      "epoch": 0.016428571428571428,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 27.24405288696289,
      "learning_rate": 9.996418774081656e-07,
      "loss": -0.0,
      "num_tokens": 2592975.0,
      "reward": 0.41484373807907104,
      "reward_std": 0.4857231378555298,
      "rewards/format_reward/mean": 0.7109375,
      "rewards/format_reward/std": 0.2928335666656494,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 23,
      "step_time": 46.8515038289479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1421.0,
      "completions/max_terminated_length": 1421.0,
      "completions/mean_length": 252.953125,
      "completions/mean_terminated_length": 252.953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.19199753180146217,
      "epoch": 0.017142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.405986785888672,
      "learning_rate": 9.994404710283998e-07,
      "loss": 0.0,
      "num_tokens": 2687908.0,
      "reward": 0.3343749940395355,
      "reward_std": 0.4513537883758545,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.3726780116558075,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 24,
      "step_time": 82.87710704799974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1449.0,
      "completions/max_terminated_length": 1449.0,
      "completions/mean_length": 164.6875,
      "completions/mean_terminated_length": 164.6875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.2559709567576647,
      "epoch": 0.017857142857142856,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 8.622386932373047,
      "learning_rate": 9.991943444144756e-07,
      "loss": -0.0,
      "num_tokens": 2777256.0,
      "reward": 0.29296875,
      "reward_std": 0.4255591630935669,
      "rewards/format_reward/mean": 0.7421875,
      "rewards/format_reward/std": 0.2816080152988434,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 25,
      "step_time": 100.27635278215166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1396.0,
      "completions/max_terminated_length": 1396.0,
      "completions/mean_length": 82.921875,
      "completions/mean_terminated_length": 82.921875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.24555067718029022,
      "epoch": 0.018571428571428572,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 17.675294876098633,
      "learning_rate": 9.989035196047348e-07,
      "loss": 0.0,
      "num_tokens": 2852203.0,
      "reward": 0.4906250238418579,
      "reward_std": 0.49432292580604553,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.265398770570755,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 26,
      "step_time": 71.82276537799044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 536.0,
      "completions/max_terminated_length": 536.0,
      "completions/mean_length": 41.0625,
      "completions/mean_terminated_length": 41.0625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.23947478830814362,
      "epoch": 0.019285714285714285,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 27.987302780151367,
      "learning_rate": 9.98568022639826e-07,
      "loss": -0.0,
      "num_tokens": 2928439.0,
      "reward": 0.63671875,
      "reward_std": 0.5023154616355896,
      "rewards/format_reward/mean": 0.8984375,
      "rewards/format_reward/std": 0.20275264978408813,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 27,
      "step_time": 26.626900972041767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 691.0,
      "completions/max_terminated_length": 691.0,
      "completions/mean_length": 58.015625,
      "completions/mean_terminated_length": 58.015625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.18591530248522758,
      "epoch": 0.02,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 33.29207992553711,
      "learning_rate": 9.981878835603716e-07,
      "loss": 0.0,
      "num_tokens": 2998840.0,
      "reward": 0.4515625238418579,
      "reward_std": 0.48989540338516235,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.20351573824882507,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 28,
      "step_time": 29.762270515959244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 574.0,
      "completions/max_terminated_length": 574.0,
      "completions/mean_length": 34.6875,
      "completions/mean_terminated_length": 34.6875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.15785411559045315,
      "epoch": 0.020714285714285713,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 21.5506534576416,
      "learning_rate": 9.977631364042794e-07,
      "loss": -0.0,
      "num_tokens": 3069804.0,
      "reward": 0.55078125,
      "reward_std": 0.502493143081665,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 29,
      "step_time": 27.969350750092417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 497.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 26.53125,
      "completions/mean_terminated_length": 26.53125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.1635878887027502,
      "epoch": 0.02142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 17.759307861328125,
      "learning_rate": 9.972938192036944e-07,
      "loss": 0.0,
      "num_tokens": 3150486.0,
      "reward": 0.653124988079071,
      "reward_std": 0.49359992146492004,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.19669894874095917,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 30,
      "step_time": 32.165516318927985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 16.28125,
      "completions/mean_terminated_length": 16.28125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.11400723084807396,
      "epoch": 0.02214285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 7.3703789710998535,
      "learning_rate": 9.967799739815924e-07,
      "loss": 0.0,
      "num_tokens": 3208048.0,
      "reward": 0.4437500238418579,
      "reward_std": 0.4787135720252991,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 31,
      "step_time": 6.480815099028405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 19.875,
      "completions/mean_terminated_length": 19.875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.23133844323456287,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.804015159606934,
      "learning_rate": 9.96221646748019e-07,
      "loss": 0.0,
      "num_tokens": 3307336.0,
      "reward": 0.21484375,
      "reward_std": 0.33814147114753723,
      "rewards/format_reward/mean": 0.8984375,
      "rewards/format_reward/std": 0.25479042530059814,
      "rewards/mcq_exact_match_reward/mean": 0.125,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 32,
      "step_time": 15.765849456947763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.140625,
      "completions/mean_terminated_length": 12.140625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.11367706023156643,
      "epoch": 0.023571428571428573,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 25.840938568115234,
      "learning_rate": 9.956188874959686e-07,
      "loss": 0.0,
      "num_tokens": 3388185.0,
      "reward": 0.36328125,
      "reward_std": 0.44670239090919495,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 33,
      "step_time": 3.9221740990760736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 97.0,
      "completions/max_terminated_length": 97.0,
      "completions/mean_length": 14.109375,
      "completions/mean_terminated_length": 14.109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "entropy": 0.12458794936537743,
      "epoch": 0.024285714285714285,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.921568870544434,
      "learning_rate": 9.949717501969079e-07,
      "loss": 0.0,
      "num_tokens": 3454872.0,
      "reward": 0.5679687857627869,
      "reward_std": 0.5037452578544617,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 34,
      "step_time": 5.632602730009239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 15.34375,
      "completions/mean_terminated_length": 15.34375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.11258962377905846,
      "epoch": 0.025,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.26678466796875,
      "learning_rate": 9.942802927959442e-07,
      "loss": 0.0,
      "num_tokens": 3534958.0,
      "reward": 0.5375000238418579,
      "reward_std": 0.5,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 35,
      "step_time": 9.37483271205565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.328125,
      "completions/mean_terminated_length": 12.328125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.13397582434117794,
      "epoch": 0.025714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.364090442657471,
      "learning_rate": 9.93544577206636e-07,
      "loss": 0.0,
      "num_tokens": 3597171.0,
      "reward": 0.4750000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 36,
      "step_time": 3.0314099779934622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 51.0,
      "completions/max_terminated_length": 51.0,
      "completions/mean_length": 13.078125,
      "completions/mean_terminated_length": 13.078125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.11545340903103352,
      "epoch": 0.02642857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.611798286437988,
      "learning_rate": 9.927646693054495e-07,
      "loss": 0.0,
      "num_tokens": 3696696.0,
      "reward": 0.3656250238418579,
      "reward_std": 0.44515693187713623,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 37,
      "step_time": 8.566134120046627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.484375,
      "completions/mean_terminated_length": 12.484375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.10600519925355911,
      "epoch": 0.027142857142857142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.415499210357666,
      "learning_rate": 9.919406389258606e-07,
      "loss": 0.0,
      "num_tokens": 3774983.0,
      "reward": 0.4437500238418579,
      "reward_std": 0.4787135720252991,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 38,
      "step_time": 3.5948679719585925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.390625,
      "completions/mean_terminated_length": 12.390625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.11269045062363148,
      "epoch": 0.027857142857142858,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 7.707824230194092,
      "learning_rate": 9.910725598521012e-07,
      "loss": -0.0,
      "num_tokens": 3879416.0,
      "reward": 0.7093750238418579,
      "reward_std": 0.4917473793029785,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 39,
      "step_time": 6.437070619082078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 65.0,
      "completions/max_terminated_length": 65.0,
      "completions/mean_length": 14.71875,
      "completions/mean_terminated_length": 14.71875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.1085408478975296,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.853015899658203,
      "learning_rate": 9.901605098125526e-07,
      "loss": 0.0,
      "num_tokens": 3949438.0,
      "reward": 0.7085937857627869,
      "reward_std": 0.49115630984306335,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 40,
      "step_time": 6.119476022082381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 19.484375,
      "completions/mean_terminated_length": 19.484375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09150838013738394,
      "epoch": 0.029285714285714286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.892045704727863e-07,
      "loss": 0.0,
      "num_tokens": 4034149.0,
      "reward": 0.4750000238418579,
      "reward_std": 0.48795002698898315,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 41,
      "step_time": 15.748524485970847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 16.890625,
      "completions/mean_terminated_length": 16.890625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09939474705606699,
      "epoch": 0.03,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.690636157989502,
      "learning_rate": 9.882048274282505e-07,
      "loss": 0.0,
      "num_tokens": 4101726.0,
      "reward": 0.39531251788139343,
      "reward_std": 0.4616841673851013,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 42,
      "step_time": 9.772893431887496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 61.640625,
      "completions/mean_terminated_length": 30.111112594604492,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.11599440686404705,
      "epoch": 0.030714285714285715,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 14.23160457611084,
      "learning_rate": 9.871613701966066e-07,
      "loss": 0.0,
      "num_tokens": 4184607.0,
      "reward": 0.5335937738418579,
      "reward_std": 0.5037994384765625,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 43,
      "step_time": 144.26547849614872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 90.21875,
      "completions/mean_terminated_length": 27.064516067504883,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09353543492034078,
      "epoch": 0.03142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 19.40498924255371,
      "learning_rate": 9.86074292209714e-07,
      "loss": 0.0,
      "num_tokens": 4253973.0,
      "reward": 0.581250011920929,
      "reward_std": 0.5070533752441406,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 44,
      "step_time": 122.23275292402832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 17.015625,
      "completions/mean_terminated_length": 17.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.07891270238906145,
      "epoch": 0.03214285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 6.091732501983643,
      "learning_rate": 9.849436908052636e-07,
      "loss": 0.0,
      "num_tokens": 4362886.0,
      "reward": 0.69140625,
      "reward_std": 0.49646008014678955,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 45,
      "step_time": 16.195965035120025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 31.546875,
      "completions/mean_terminated_length": 31.546875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "entropy": 0.1003469587303698,
      "epoch": 0.032857142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 13.104848861694336,
      "learning_rate": 9.837696672180618e-07,
      "loss": 0.0,
      "num_tokens": 4433921.0,
      "reward": 0.7718750238418579,
      "reward_std": 0.4732423424720764,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 46,
      "step_time": 26.342010580934584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 81.0,
      "completions/max_terminated_length": 81.0,
      "completions/mean_length": 21.015625,
      "completions/mean_terminated_length": 21.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09486840199679136,
      "epoch": 0.03357142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 5.788740158081055,
      "learning_rate": 9.825523265709665e-07,
      "loss": -0.0,
      "num_tokens": 4514394.0,
      "reward": 0.8460937738418579,
      "reward_std": 0.44179511070251465,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 47,
      "step_time": 6.866083464003168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 18.265625,
      "completions/mean_terminated_length": 18.265625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.07957334164530039,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 11.120720863342285,
      "learning_rate": 9.812917778654747e-07,
      "loss": 0.0,
      "num_tokens": 4614995.0,
      "reward": 0.7523437738418579,
      "reward_std": 0.48439374566078186,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.16194961965084076,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 48,
      "step_time": 13.319389674987178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 63.0,
      "completions/max_terminated_length": 63.0,
      "completions/mean_length": 13.765625,
      "completions/mean_terminated_length": 13.765625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.05717065744102001,
      "epoch": 0.035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.799881339719614e-07,
      "loss": 0.0,
      "num_tokens": 4694860.0,
      "reward": 0.3500000238418579,
      "reward_std": 0.4364357888698578,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 49,
      "step_time": 6.507926742138807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 88.109375,
      "completions/mean_terminated_length": 24.887096405029297,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "entropy": 0.09659092174842954,
      "epoch": 0.03571428571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.933631896972656,
      "learning_rate": 9.786415116195732e-07,
      "loss": 0.0,
      "num_tokens": 4779483.0,
      "reward": 0.706250011920929,
      "reward_std": 0.49597588181495667,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 50,
      "step_time": 125.25686850992497
    }
  ],
  "logging_steps": 1,
  "max_steps": 350,
  "num_input_tokens_seen": 4779483,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}