{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0666666666666667,
  "eval_steps": 500,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2125,
      "completions/max_length": 490.8,
      "completions/max_terminated_length": 470.6,
      "completions/mean_length": 414.4625,
      "completions/mean_terminated_length": 395.1430999755859,
      "completions/min_length": 310.5,
      "completions/min_terminated_length": 310.5,
      "entropy": 0.5249067967757582,
      "epoch": 0.06666666666666667,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.65625,
      "kl": 0.03049815017875517,
      "learning_rate": 9.4375e-06,
      "loss": -0.010575222969055175,
      "num_tokens": 46025.0,
      "reward": 0.73009033203125,
      "reward_std": 0.4704558838158846,
      "rewards/JointRewardFunction/mean": 0.73009033203125,
      "rewards/JointRewardFunction/std": 0.47045588716864584,
      "step": 10,
      "step_time": 21.721466124300058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15,
      "completions/max_length": 474.6,
      "completions/max_terminated_length": 432.8,
      "completions/mean_length": 372.65,
      "completions/mean_terminated_length": 348.14678955078125,
      "completions/min_length": 280.5,
      "completions/min_terminated_length": 280.5,
      "entropy": 0.4361519979313016,
      "epoch": 0.13333333333333333,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.15625,
      "kl": 0.0652532160282135,
      "learning_rate": 8.8125e-06,
      "loss": 0.016564452648162843,
      "num_tokens": 89597.0,
      "reward": 0.95604248046875,
      "reward_std": 0.5059975624084473,
      "rewards/JointRewardFunction/mean": 0.95604248046875,
      "rewards/JointRewardFunction/std": 0.5059975773096085,
      "step": 20,
      "step_time": 22.023339059400495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0375,
      "completions/max_length": 425.6,
      "completions/max_terminated_length": 406.9,
      "completions/mean_length": 317.9,
      "completions/mean_terminated_length": 309.6845275878906,
      "completions/min_length": 231.4,
      "completions/min_terminated_length": 231.4,
      "entropy": 0.45581948235630987,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 2.859375,
      "kl": 0.1008026220370084,
      "learning_rate": 8.1875e-06,
      "loss": 0.01793680489063263,
      "num_tokens": 126445.0,
      "reward": 1.2108154296875,
      "reward_std": 0.40027157836593685,
      "rewards/JointRewardFunction/mean": 1.2108154296875,
      "rewards/JointRewardFunction/std": 0.40027157838921995,
      "step": 30,
      "step_time": 19.79602696299935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1125,
      "completions/max_length": 493.9,
      "completions/max_terminated_length": 468.6,
      "completions/mean_length": 352.1875,
      "completions/mean_terminated_length": 335.66607666015625,
      "completions/min_length": 238.5,
      "completions/min_terminated_length": 238.5,
      "entropy": 0.4116522930562496,
      "epoch": 0.26666666666666666,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.5625,
      "kl": 0.12701121605932714,
      "learning_rate": 7.5625e-06,
      "loss": 0.05010480284690857,
      "num_tokens": 167932.0,
      "reward": 1.2074462890625,
      "reward_std": 0.42208707332611084,
      "rewards/JointRewardFunction/mean": 1.2074462890625,
      "rewards/JointRewardFunction/std": 0.4220870822668076,
      "step": 40,
      "step_time": 22.582551179301117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 393.4,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 300.8625,
      "completions/mean_terminated_length": 296.27321472167966,
      "completions/min_length": 226.3,
      "completions/min_terminated_length": 226.3,
      "entropy": 0.4190680437721312,
      "epoch": 0.3333333333333333,
      "frac_reward_zero_std": 0.35,
      "grad_norm": 3.25,
      "kl": 0.13846059744246303,
      "learning_rate": 6.9375e-06,
      "loss": 0.03549057841300964,
      "num_tokens": 204717.0,
      "reward": 1.26171875,
      "reward_std": 0.38662562653189525,
      "rewards/JointRewardFunction/mean": 1.26171875,
      "rewards/JointRewardFunction/std": 0.3866256324923597,
      "step": 50,
      "step_time": 18.59559666490022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 426.2,
      "completions/max_terminated_length": 414.2,
      "completions/mean_length": 285.9125,
      "completions/mean_terminated_length": 280.5291687011719,
      "completions/min_length": 190.9,
      "completions/min_terminated_length": 190.9,
      "entropy": 0.3493430153466761,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 2.375,
      "kl": 0.15198964411392807,
      "learning_rate": 6.3125e-06,
      "loss": 0.01220681592822075,
      "num_tokens": 240990.0,
      "reward": 1.27255859375,
      "reward_std": 0.34817005618242547,
      "rewards/JointRewardFunction/mean": 1.27255859375,
      "rewards/JointRewardFunction/std": 0.34817005618242547,
      "step": 60,
      "step_time": 19.868489251599385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 370.4,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 277.2875,
      "completions/mean_terminated_length": 272.9107177734375,
      "completions/min_length": 179.9,
      "completions/min_terminated_length": 179.9,
      "entropy": 0.34267437979578974,
      "epoch": 0.4666666666666667,
      "frac_reward_zero_std": 0.55,
      "grad_norm": 0.017333984375,
      "kl": 0.18903981931507588,
      "learning_rate": 5.6875e-06,
      "loss": 0.019876784086227416,
      "num_tokens": 276969.0,
      "reward": 1.381591796875,
      "reward_std": 0.2380124439485371,
      "rewards/JointRewardFunction/mean": 1.381591796875,
      "rewards/JointRewardFunction/std": 0.23801244990900158,
      "step": 70,
      "step_time": 17.71040062670145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.1,
      "completions/max_terminated_length": 378.1,
      "completions/mean_length": 271.3625,
      "completions/mean_terminated_length": 271.3625,
      "completions/min_length": 171.4,
      "completions/min_terminated_length": 171.4,
      "entropy": 0.3542415237054229,
      "epoch": 0.5333333333333333,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0751953125,
      "kl": 0.19187260391190647,
      "learning_rate": 5.0625e-06,
      "loss": 0.0034067176282405855,
      "num_tokens": 312350.0,
      "reward": 1.40601806640625,
      "reward_std": 0.2126459252787754,
      "rewards/JointRewardFunction/mean": 1.40601806640625,
      "rewards/JointRewardFunction/std": 0.21264592825900763,
      "step": 80,
      "step_time": 18.04391895070148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 357.6,
      "completions/max_terminated_length": 341.1,
      "completions/mean_length": 272.275,
      "completions/mean_terminated_length": 260.9375,
      "completions/min_length": 171.3,
      "completions/min_terminated_length": 171.3,
      "entropy": 0.31064137276262044,
      "epoch": 0.6,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 1.875,
      "kl": 0.20775549318641423,
      "learning_rate": 4.4375e-06,
      "loss": 0.0008514203131198883,
      "num_tokens": 348280.0,
      "reward": 1.353466796875,
      "reward_std": 0.21437984704971313,
      "rewards/JointRewardFunction/mean": 1.353466796875,
      "rewards/JointRewardFunction/std": 0.21437986195087433,
      "step": 90,
      "step_time": 17.224812426199787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 386.0,
      "completions/max_terminated_length": 357.8,
      "completions/mean_length": 267.65,
      "completions/mean_terminated_length": 250.7500030517578,
      "completions/min_length": 169.7,
      "completions/min_terminated_length": 169.7,
      "entropy": 0.3662784457206726,
      "epoch": 0.6666666666666666,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.01239013671875,
      "kl": 0.21083315466530622,
      "learning_rate": 3.8125e-06,
      "loss": 0.011665140837430954,
      "num_tokens": 384576.0,
      "reward": 1.34949951171875,
      "reward_std": 0.23986690491437912,
      "rewards/JointRewardFunction/mean": 1.34949951171875,
      "rewards/JointRewardFunction/std": 0.2398669108748436,
      "step": 100,
      "step_time": 18.349693166400904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 382.2,
      "completions/max_terminated_length": 364.2,
      "completions/mean_length": 275.575,
      "completions/mean_terminated_length": 272.3714294433594,
      "completions/min_length": 188.2,
      "completions/min_terminated_length": 188.2,
      "entropy": 0.37121466230601075,
      "epoch": 0.7333333333333333,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.0146484375,
      "kl": 0.21329910093918442,
      "learning_rate": 3.1875e-06,
      "loss": 0.007188273221254348,
      "num_tokens": 419570.0,
      "reward": 1.350439453125,
      "reward_std": 0.23384397297631948,
      "rewards/JointRewardFunction/mean": 1.350439453125,
      "rewards/JointRewardFunction/std": 0.23384397297631948,
      "step": 110,
      "step_time": 18.14425329649821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 346.6,
      "completions/max_terminated_length": 345.3,
      "completions/mean_length": 252.15,
      "completions/mean_terminated_length": 250.38750305175782,
      "completions/min_length": 166.8,
      "completions/min_terminated_length": 166.8,
      "entropy": 0.4173679456114769,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 2.09375,
      "kl": 0.21794578088447453,
      "learning_rate": 2.5625e-06,
      "loss": -0.0005294814705848694,
      "num_tokens": 452006.0,
      "reward": 1.35863037109375,
      "reward_std": 0.24987269788980485,
      "rewards/JointRewardFunction/mean": 1.35863037109375,
      "rewards/JointRewardFunction/std": 0.24987269788980485,
      "step": 120,
      "step_time": 16.82072365879685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0375,
      "completions/max_length": 441.8,
      "completions/max_terminated_length": 440.3,
      "completions/mean_length": 301.7375,
      "completions/mean_terminated_length": 296.75750122070315,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.33190380278974774,
      "epoch": 0.8666666666666667,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.09375,
      "kl": 0.18061227248981596,
      "learning_rate": 1.9375e-06,
      "loss": 0.008612716197967529,
      "num_tokens": 490397.0,
      "reward": 1.374853515625,
      "reward_std": 0.2561936320271343,
      "rewards/JointRewardFunction/mean": 1.374853515625,
      "rewards/JointRewardFunction/std": 0.2561936320271343,
      "step": 130,
      "step_time": 20.63984096989916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 340.1,
      "completions/max_terminated_length": 340.1,
      "completions/mean_length": 256.5,
      "completions/mean_terminated_length": 256.5,
      "completions/min_length": 179.7,
      "completions/min_terminated_length": 179.7,
      "entropy": 0.38072127737104894,
      "epoch": 0.9333333333333333,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 2.96875,
      "kl": 0.21334810927510262,
      "learning_rate": 1.3125000000000001e-06,
      "loss": 0.012666280567646026,
      "num_tokens": 523549.0,
      "reward": 1.436767578125,
      "reward_std": 0.153020023368299,
      "rewards/JointRewardFunction/mean": 1.436767578125,
      "rewards/JointRewardFunction/std": 0.153020023368299,
      "step": 140,
      "step_time": 16.57510228729916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 371.9,
      "completions/max_terminated_length": 370.6,
      "completions/mean_length": 257.4125,
      "completions/mean_terminated_length": 254.5607147216797,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3929610840976238,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 0.01513671875,
      "kl": 0.21818328225053846,
      "learning_rate": 6.875000000000001e-07,
      "loss": 0.011788636445999146,
      "num_tokens": 556454.0,
      "reward": 1.348779296875,
      "reward_std": 0.3280519276857376,
      "rewards/JointRewardFunction/mean": 1.348779296875,
      "rewards/JointRewardFunction/std": 0.32805192805826666,
      "step": 150,
      "step_time": 17.876252979701167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 343.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 240.45,
      "completions/mean_terminated_length": 240.45,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.4010548871010542,
      "epoch": 1.0666666666666667,
      "frac_reward_zero_std": 0.65,
      "grad_norm": 0.025146484375,
      "kl": 0.22879955088719725,
      "learning_rate": 6.250000000000001e-08,
      "loss": 0.022160810232162476,
      "num_tokens": 587606.0,
      "reward": 1.3880859375,
      "reward_std": 0.237497678399086,
      "rewards/JointRewardFunction/mean": 1.3880859375,
      "rewards/JointRewardFunction/std": 0.2374976843595505,
      "step": 160,
      "step_time": 16.50347660660045
    }
  ],
  "logging_steps": 10,
  "max_steps": 160,
  "num_input_tokens_seen": 587606,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}