{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.28,
  "eval_steps": 500,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 959.3,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 628.5875,
      "completions/mean_terminated_length": 604.4089508056641,
      "completions/min_length": 324.2,
      "completions/min_terminated_length": 324.2,
      "entropy": 0.8960087668150664,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.225,
      "grad_norm": 1.734375,
      "kl": 0.04410275101472507,
      "learning_rate": 9.4375e-06,
      "loss": -0.004480601102113724,
      "num_tokens": 70931.0,
      "reward": 0.637615966796875,
      "reward_std": 0.4684956520795822,
      "rewards/JointRewardFunction/mean": 0.637615966796875,
      "rewards/JointRewardFunction/std": 0.4684956640005112,
      "step": 10,
      "step_time": 39.2735027824996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1375,
      "completions/max_length": 943.7,
      "completions/max_terminated_length": 829.8,
      "completions/mean_length": 631.4625,
      "completions/mean_terminated_length": 579.1692932128906,
      "completions/min_length": 324.2,
      "completions/min_terminated_length": 324.2,
      "entropy": 1.0276442520320415,
      "epoch": 0.16,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.703125,
      "kl": 0.08849158070515842,
      "learning_rate": 8.8125e-06,
      "loss": 0.01496470272541046,
      "num_tokens": 142092.0,
      "reward": 0.539324951171875,
      "reward_std": 0.49008582532405853,
      "rewards/JointRewardFunction/mean": 0.539324951171875,
      "rewards/JointRewardFunction/std": 0.4900858402252197,
      "step": 20,
      "step_time": 40.54311619299951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 863.4,
      "completions/max_terminated_length": 835.9,
      "completions/mean_length": 554.5875,
      "completions/mean_terminated_length": 548.3660766601563,
      "completions/min_length": 307.3,
      "completions/min_terminated_length": 307.3,
      "entropy": 1.1838637091219426,
      "epoch": 0.24,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 3.046875,
      "kl": 0.11827877229079604,
      "learning_rate": 8.1875e-06,
      "loss": 0.009982097893953323,
      "num_tokens": 206903.0,
      "reward": 0.6125,
      "reward_std": 0.4704344987869263,
      "rewards/JointRewardFunction/mean": 0.6125,
      "rewards/JointRewardFunction/std": 0.47043450474739074,
      "step": 30,
      "step_time": 37.010521245800916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 742.1,
      "completions/max_terminated_length": 659.0,
      "completions/mean_length": 425.7,
      "completions/mean_terminated_length": 410.8892883300781,
      "completions/min_length": 219.2,
      "completions/min_terminated_length": 219.2,
      "entropy": 1.2652597405016421,
      "epoch": 0.32,
      "frac_reward_zero_std": 0.15,
      "grad_norm": 6.125,
      "kl": 0.3052154924720526,
      "learning_rate": 7.5625e-06,
      "loss": 0.027592796087265014,
      "num_tokens": 261533.0,
      "reward": 0.61771240234375,
      "reward_std": 0.46430147290229795,
      "rewards/JointRewardFunction/mean": 0.61771240234375,
      "rewards/JointRewardFunction/std": 0.46430149376392366,
      "step": 40,
      "step_time": 32.063390286398864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0875,
      "completions/max_length": 919.3,
      "completions/max_terminated_length": 713.2,
      "completions/mean_length": 511.4125,
      "completions/mean_terminated_length": 464.44822998046874,
      "completions/min_length": 244.6,
      "completions/min_terminated_length": 244.6,
      "entropy": 1.3154176332056522,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 3.6875,
      "kl": 0.24474074998870493,
      "learning_rate": 6.9375e-06,
      "loss": 0.001818625070154667,
      "num_tokens": 323276.0,
      "reward": 0.537213134765625,
      "reward_std": 0.5033262223005295,
      "rewards/JointRewardFunction/mean": 0.537213134765625,
      "rewards/JointRewardFunction/std": 0.5033262312412262,
      "step": 50,
      "step_time": 39.09056931000159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 824.5,
      "completions/max_terminated_length": 711.6,
      "completions/mean_length": 520.4625,
      "completions/mean_terminated_length": 484.48512268066406,
      "completions/min_length": 261.1,
      "completions/min_terminated_length": 261.1,
      "entropy": 1.2718341693282127,
      "epoch": 0.48,
      "frac_reward_zero_std": 0.225,
      "grad_norm": 4.6875,
      "kl": 0.2580411507748067,
      "learning_rate": 6.3125e-06,
      "loss": 0.014349016547203063,
      "num_tokens": 385253.0,
      "reward": 0.54111328125,
      "reward_std": 0.4758811920881271,
      "rewards/JointRewardFunction/mean": 0.54111328125,
      "rewards/JointRewardFunction/std": 0.47588120102882386,
      "step": 60,
      "step_time": 35.274126689498736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 943.6,
      "completions/max_terminated_length": 808.7,
      "completions/mean_length": 537.75,
      "completions/mean_terminated_length": 502.15358276367186,
      "completions/min_length": 267.1,
      "completions/min_terminated_length": 267.1,
      "entropy": 1.244665590673685,
      "epoch": 0.56,
      "frac_reward_zero_std": 0.175,
      "grad_norm": 4.0625,
      "kl": 0.24244523425586523,
      "learning_rate": 5.6875e-06,
      "loss": 0.0023317448794841766,
      "num_tokens": 448729.0,
      "reward": 0.69791259765625,
      "reward_std": 0.4654367908835411,
      "rewards/JointRewardFunction/mean": 0.69791259765625,
      "rewards/JointRewardFunction/std": 0.4654367953538895,
      "step": 70,
      "step_time": 40.008808337500525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 810.1,
      "completions/max_terminated_length": 720.2,
      "completions/mean_length": 534.1625,
      "completions/mean_terminated_length": 508.99286193847655,
      "completions/min_length": 286.4,
      "completions/min_terminated_length": 286.4,
      "entropy": 1.2685907267034053,
      "epoch": 0.64,
      "frac_reward_zero_std": 0.225,
      "grad_norm": 4.125,
      "kl": 0.2162147051654756,
      "learning_rate": 5.0625e-06,
      "loss": 0.014069165289402007,
      "num_tokens": 512076.0,
      "reward": 0.728045654296875,
      "reward_std": 0.5096077308058738,
      "rewards/JointRewardFunction/mean": 0.728045654296875,
      "rewards/JointRewardFunction/std": 0.5096077516674995,
      "step": 80,
      "step_time": 34.66793936170143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 793.3,
      "completions/max_terminated_length": 733.4,
      "completions/mean_length": 492.2,
      "completions/mean_terminated_length": 478.558935546875,
      "completions/min_length": 245.4,
      "completions/min_terminated_length": 245.4,
      "entropy": 1.24496211335063,
      "epoch": 0.72,
      "frac_reward_zero_std": 0.175,
      "grad_norm": 4.03125,
      "kl": 0.24482853645458819,
      "learning_rate": 4.4375e-06,
      "loss": 0.032863426208496097,
      "num_tokens": 572072.0,
      "reward": 0.64200439453125,
      "reward_std": 0.5001831084489823,
      "rewards/JointRewardFunction/mean": 0.64200439453125,
      "rewards/JointRewardFunction/std": 0.5001831203699112,
      "step": 90,
      "step_time": 34.359502547597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 768.8,
      "completions/max_terminated_length": 768.8,
      "completions/mean_length": 475.65,
      "completions/mean_terminated_length": 475.65,
      "completions/min_length": 258.8,
      "completions/min_terminated_length": 258.8,
      "entropy": 1.330046895891428,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.175,
      "grad_norm": 3.375,
      "kl": 0.24854949009604752,
      "learning_rate": 3.8125e-06,
      "loss": 0.0744681715965271,
      "num_tokens": 630790.0,
      "reward": 0.725848388671875,
      "reward_std": 0.4486017137765884,
      "rewards/JointRewardFunction/mean": 0.725848388671875,
      "rewards/JointRewardFunction/std": 0.4486017107963562,
      "step": 100,
      "step_time": 33.44759687739897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 695.2,
      "completions/max_terminated_length": 669.5,
      "completions/mean_length": 443.0125,
      "completions/mean_terminated_length": 436.7017883300781,
      "completions/min_length": 213.1,
      "completions/min_terminated_length": 213.1,
      "entropy": 1.3013170935213565,
      "epoch": 0.88,
      "frac_reward_zero_std": 0.275,
      "grad_norm": 3.984375,
      "kl": 0.3150750307366252,
      "learning_rate": 3.1875e-06,
      "loss": 0.020898757874965666,
      "num_tokens": 686775.0,
      "reward": 0.71988525390625,
      "reward_std": 0.515497374534607,
      "rewards/JointRewardFunction/mean": 0.71988525390625,
      "rewards/JointRewardFunction/std": 0.515497374534607,
      "step": 110,
      "step_time": 30.408145976099696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 757.0,
      "completions/max_terminated_length": 667.6,
      "completions/mean_length": 428.675,
      "completions/mean_terminated_length": 412.0500061035156,
      "completions/min_length": 201.6,
      "completions/min_terminated_length": 201.6,
      "entropy": 1.2176271453499794,
      "epoch": 0.96,
      "frac_reward_zero_std": 0.175,
      "grad_norm": 6.53125,
      "kl": 0.29011352979578076,
      "learning_rate": 2.5625e-06,
      "loss": 0.023420125246047974,
      "num_tokens": 741441.0,
      "reward": 0.738653564453125,
      "reward_std": 0.45226994156837463,
      "rewards/JointRewardFunction/mean": 0.738653564453125,
      "rewards/JointRewardFunction/std": 0.45226994901895523,
      "step": 120,
      "step_time": 32.81503715240033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 645.8,
      "completions/max_terminated_length": 645.8,
      "completions/mean_length": 429.1875,
      "completions/mean_terminated_length": 429.1875,
      "completions/min_length": 218.8,
      "completions/min_terminated_length": 218.8,
      "entropy": 1.313951000571251,
      "epoch": 1.04,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 3.78125,
      "kl": 0.3095056655351073,
      "learning_rate": 1.9375e-06,
      "loss": 0.031660494208335874,
      "num_tokens": 796394.0,
      "reward": 0.7450439453125,
      "reward_std": 0.46799357831478117,
      "rewards/JointRewardFunction/mean": 0.7450439453125,
      "rewards/JointRewardFunction/std": 0.46799357831478117,
      "step": 130,
      "step_time": 28.36433180910135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 661.5,
      "completions/max_terminated_length": 661.5,
      "completions/mean_length": 416.2625,
      "completions/mean_terminated_length": 416.2625,
      "completions/min_length": 235.6,
      "completions/min_terminated_length": 235.6,
      "entropy": 1.3542070075869561,
      "epoch": 1.12,
      "frac_reward_zero_std": 0.2,
      "grad_norm": 2.328125,
      "kl": 0.3334041152149439,
      "learning_rate": 1.3125000000000001e-06,
      "loss": 0.022475141286849975,
      "num_tokens": 850343.0,
      "reward": 0.67725830078125,
      "reward_std": 0.3997616931796074,
      "rewards/JointRewardFunction/mean": 0.67725830078125,
      "rewards/JointRewardFunction/std": 0.399761700630188,
      "step": 140,
      "step_time": 28.77159399340053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 703.9,
      "completions/max_terminated_length": 703.9,
      "completions/mean_length": 422.925,
      "completions/mean_terminated_length": 422.925,
      "completions/min_length": 209.7,
      "completions/min_terminated_length": 209.7,
      "entropy": 1.2913881182670592,
      "epoch": 1.2,
      "frac_reward_zero_std": 0.2,
      "grad_norm": 3.734375,
      "kl": 0.31345505844801663,
      "learning_rate": 6.875000000000001e-07,
      "loss": 0.029654264450073242,
      "num_tokens": 904753.0,
      "reward": 0.79158935546875,
      "reward_std": 0.4540836468338966,
      "rewards/JointRewardFunction/mean": 0.79158935546875,
      "rewards/JointRewardFunction/std": 0.45408365726470945,
      "step": 150,
      "step_time": 30.629617839398996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 645.8,
      "completions/max_terminated_length": 645.8,
      "completions/mean_length": 411.0125,
      "completions/mean_terminated_length": 411.0125,
      "completions/min_length": 207.4,
      "completions/min_terminated_length": 207.4,
      "entropy": 1.2935365058481694,
      "epoch": 1.28,
      "frac_reward_zero_std": 0.175,
      "grad_norm": 3.171875,
      "kl": 0.3413598489947617,
      "learning_rate": 6.250000000000001e-08,
      "loss": -0.008248078823089599,
      "num_tokens": 958536.0,
      "reward": 0.695233154296875,
      "reward_std": 0.40624782145023347,
      "rewards/JointRewardFunction/mean": 0.695233154296875,
      "rewards/JointRewardFunction/std": 0.40624783337116244,
      "step": 160,
      "step_time": 28.084300646101475
    }
  ],
  "logging_steps": 10,
  "max_steps": 160,
  "num_input_tokens_seen": 958536,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}