exp2-qwen-mbpp-s123-lambda-…/last-checkpoint/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4125,
      "completions/max_length": 499.4,
      "completions/max_terminated_length": 416.1,
      "completions/mean_length": 440.2,
      "completions/mean_terminated_length": 360.0207153320313,
      "completions/min_length": 351.3,
      "completions/min_terminated_length": 300.1,
      "entropy": 0.35336310751736166,
      "epoch": 0.06666666666666667,
      "frac_reward_zero_std": 0.15,
      "grad_norm": 2.46875,
      "kl": 0.04995681893542496,
      "learning_rate": 9.7e-06,
      "loss": 0.007881630957126618,
      "num_tokens": 48280.0,
      "reward": 0.6638354301452637,
      "reward_std": 0.4613180309534073,
      "rewards/JointRewardFunction/mean": 0.6638354301452637,
      "rewards/JointRewardFunction/std": 0.46131803542375566,
      "step": 10,
      "step_time": 21.880370603101618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4125,
      "completions/max_length": 505.1,
      "completions/max_terminated_length": 485.6,
      "completions/mean_length": 454.75,
      "completions/mean_terminated_length": 429.1646453857422,
      "completions/min_length": 359.6,
      "completions/min_terminated_length": 359.6,
      "entropy": 0.40911334455013276,
      "epoch": 0.13333333333333333,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 1.859375,
      "kl": 0.06663629063405097,
      "learning_rate": 9.366666666666668e-06,
      "loss": -0.007935921847820281,
      "num_tokens": 97272.0,
      "reward": 0.742634254693985,
      "reward_std": 0.5053252905607224,
      "rewards/JointRewardFunction/mean": 0.742634254693985,
      "rewards/JointRewardFunction/std": 0.505325311422348,
      "step": 20,
      "step_time": 22.897377014198717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3625,
      "completions/max_length": 496.7,
      "completions/max_terminated_length": 476.1,
      "completions/mean_length": 449.0625,
      "completions/mean_terminated_length": 423.77500610351564,
      "completions/min_length": 347.1,
      "completions/min_terminated_length": 347.1,
      "entropy": 0.5090887371450663,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.40625,
      "kl": 0.05061149680987,
      "learning_rate": 9.033333333333334e-06,
      "loss": -0.0031354159116744997,
      "num_tokens": 145589.0,
      "reward": 0.9517577826976776,
      "reward_std": 0.356577847735025,
      "rewards/JointRewardFunction/mean": 0.9517577826976776,
      "rewards/JointRewardFunction/std": 0.35657785963267086,
      "step": 30,
      "step_time": 22.42113570249967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 508.9,
      "completions/max_terminated_length": 420.2,
      "completions/mean_length": 447.0,
      "completions/mean_terminated_length": 361.18226623535156,
      "completions/min_length": 350.8,
      "completions/min_terminated_length": 299.6,
      "entropy": 0.5721639156341553,
      "epoch": 0.26666666666666666,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 2.765625,
      "kl": 0.05449348199181259,
      "learning_rate": 8.700000000000001e-06,
      "loss": -0.0012190598994493485,
      "num_tokens": 195445.0,
      "reward": 0.9749389350414276,
      "reward_std": 0.41350028812885287,
      "rewards/JointRewardFunction/mean": 0.9749389350414276,
      "rewards/JointRewardFunction/std": 0.4135002911090851,
      "step": 40,
      "step_time": 22.95661881720298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1625,
      "completions/max_length": 467.3,
      "completions/max_terminated_length": 454.5,
      "completions/mean_length": 390.775,
      "completions/mean_terminated_length": 377.6511932373047,
      "completions/min_length": 296.3,
      "completions/min_terminated_length": 296.3,
      "entropy": 0.6591073881834746,
      "epoch": 0.3333333333333333,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 3.03125,
      "kl": 0.08812928411643953,
      "learning_rate": 8.366666666666667e-06,
      "loss": -0.008682972937822341,
      "num_tokens": 240011.0,
      "reward": 1.1314965546131135,
      "reward_std": 0.2849295660853386,
      "rewards/JointRewardFunction/mean": 1.1314965546131135,
      "rewards/JointRewardFunction/std": 0.2849295552819967,
      "step": 50,
      "step_time": 21.223023884699796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.175,
      "completions/max_length": 473.6,
      "completions/max_terminated_length": 447.8,
      "completions/mean_length": 379.525,
      "completions/mean_terminated_length": 354.0325042724609,
      "completions/min_length": 264.9,
      "completions/min_terminated_length": 264.9,
      "entropy": 0.6001430394127965,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.35,
      "grad_norm": 2.71875,
      "kl": 0.07532973024062813,
      "learning_rate": 8.033333333333335e-06,
      "loss": 0.010028349608182907,
      "num_tokens": 283417.0,
      "reward": 1.055295366048813,
      "reward_std": 0.3583150297403336,
      "rewards/JointRewardFunction/mean": 1.055295366048813,
      "rewards/JointRewardFunction/std": 0.358315047621727,
      "step": 60,
      "step_time": 21.51059049019932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 491.9,
      "completions/max_terminated_length": 471.2,
      "completions/mean_length": 376.875,
      "completions/mean_terminated_length": 364.50536193847654,
      "completions/min_length": 270.7,
      "completions/min_terminated_length": 270.7,
      "entropy": 0.5387065753340721,
      "epoch": 0.4666666666666667,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 1.9375,
      "kl": 0.07996222919318825,
      "learning_rate": 7.7e-06,
      "loss": 0.01158405989408493,
      "num_tokens": 326415.0,
      "reward": 1.21389399766922,
      "reward_std": 0.20652158036828042,
      "rewards/JointRewardFunction/mean": 1.21389399766922,
      "rewards/JointRewardFunction/std": 0.20652157738804816,
      "step": 70,
      "step_time": 22.325782465600422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 452.9,
      "completions/max_terminated_length": 424.3,
      "completions/mean_length": 341.4625,
      "completions/mean_terminated_length": 331.04286193847656,
      "completions/min_length": 254.4,
      "completions/min_terminated_length": 254.4,
      "entropy": 0.6442078746855259,
      "epoch": 0.5333333333333333,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 2.8125,
      "kl": 0.12920588869601488,
      "learning_rate": 7.3666666666666676e-06,
      "loss": 0.026441246271133423,
      "num_tokens": 366144.0,
      "reward": 1.1669726014137267,
      "reward_std": 0.2815501570701599,
      "rewards/JointRewardFunction/mean": 1.1669726014137267,
      "rewards/JointRewardFunction/std": 0.28155014626681807,
      "step": 80,
      "step_time": 20.707106610499977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 458.6,
      "completions/max_terminated_length": 436.6,
      "completions/mean_length": 344.7875,
      "completions/mean_terminated_length": 332.6904815673828,
      "completions/min_length": 257.1,
      "completions/min_terminated_length": 257.1,
      "entropy": 0.6171283535659313,
      "epoch": 0.6,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 0.1728515625,
      "kl": 0.11114234835840761,
      "learning_rate": 7.033333333333334e-06,
      "loss": -0.0013597654178738594,
      "num_tokens": 406607.0,
      "reward": 1.0987597286701203,
      "reward_std": 0.3017842784523964,
      "rewards/JointRewardFunction/mean": 1.0987597286701203,
      "rewards/JointRewardFunction/std": 0.3017842710018158,
      "step": 90,
      "step_time": 21.128214740598196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 445.1,
      "completions/max_terminated_length": 418.1,
      "completions/mean_length": 350.3625,
      "completions/mean_terminated_length": 341.49822387695315,
      "completions/min_length": 258.7,
      "completions/min_terminated_length": 258.7,
      "entropy": 0.7072938833385706,
      "epoch": 0.6666666666666666,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.6875,
      "kl": 0.10777388750575483,
      "learning_rate": 6.700000000000001e-06,
      "loss": 0.009119665622711182,
      "num_tokens": 446820.0,
      "reward": 1.167529249191284,
      "reward_std": 0.27103030947037043,
      "rewards/JointRewardFunction/mean": 1.167529249191284,
      "rewards/JointRewardFunction/std": 0.2710303008556366,
      "step": 100,
      "step_time": 20.569578059400374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1,
      "completions/max_length": 497.2,
      "completions/max_terminated_length": 483.4,
      "completions/mean_length": 386.8,
      "completions/mean_terminated_length": 376.6464324951172,
      "completions/min_length": 274.3,
      "completions/min_terminated_length": 274.3,
      "entropy": 0.5864197112619877,
      "epoch": 0.7333333333333333,
      "frac_reward_zero_std": 0.55,
      "grad_norm": 1.953125,
      "kl": 0.10330515620298683,
      "learning_rate": 6.366666666666668e-06,
      "loss": 0.004881048575043678,
      "num_tokens": 491392.0,
      "reward": 1.1497362732887269,
      "reward_std": 0.2508866846153978,
      "rewards/JointRewardFunction/mean": 1.1497362732887269,
      "rewards/JointRewardFunction/std": 0.25088667746749704,
      "step": 110,
      "step_time": 22.38237403490093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1375,
      "completions/max_length": 489.8,
      "completions/max_terminated_length": 455.9,
      "completions/mean_length": 402.7,
      "completions/mean_terminated_length": 384.0738189697266,
      "completions/min_length": 297.5,
      "completions/min_terminated_length": 297.5,
      "entropy": 0.5570502711459995,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 2.265625,
      "kl": 0.08270290573127567,
      "learning_rate": 6.033333333333335e-06,
      "loss": 0.013686606287956237,
      "num_tokens": 537628.0,
      "reward": 1.0768773972988128,
      "reward_std": 0.32933869063854215,
      "rewards/JointRewardFunction/mean": 1.0768773972988128,
      "rewards/JointRewardFunction/std": 0.32933869063854215,
      "step": 120,
      "step_time": 22.374344250299693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1625,
      "completions/max_length": 474.2,
      "completions/max_terminated_length": 460.2,
      "completions/mean_length": 382.75,
      "completions/mean_terminated_length": 366.1206024169922,
      "completions/min_length": 286.6,
      "completions/min_terminated_length": 286.6,
      "entropy": 0.5301733467727899,
      "epoch": 0.8666666666666667,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 2.484375,
      "kl": 0.09477438307367266,
      "learning_rate": 5.7e-06,
      "loss": -0.0015326094813644885,
      "num_tokens": 583628.0,
      "reward": 1.1464037537574767,
      "reward_std": 0.26481162309646605,
      "rewards/JointRewardFunction/mean": 1.1464037537574767,
      "rewards/JointRewardFunction/std": 0.264811622351408,
      "step": 130,
      "step_time": 21.819646276899583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.175,
      "completions/max_length": 492.1,
      "completions/max_terminated_length": 456.2,
      "completions/mean_length": 383.6375,
      "completions/mean_terminated_length": 364.0131011962891,
      "completions/min_length": 279.7,
      "completions/min_terminated_length": 279.7,
      "entropy": 0.45302344355732205,
      "epoch": 0.9333333333333333,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 3.4375,
      "kl": 0.07856867800001055,
      "learning_rate": 5.366666666666666e-06,
      "loss": 0.027430105209350585,
      "num_tokens": 628107.0,
      "reward": 1.1271068811416627,
      "reward_std": 0.3177687225921545,
      "rewards/JointRewardFunction/mean": 1.1271068811416627,
      "rewards/JointRewardFunction/std": 0.31776872408227064,
      "step": 140,
      "step_time": 22.27468511669831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.175,
      "completions/max_length": 488.7,
      "completions/max_terminated_length": 460.8,
      "completions/mean_length": 384.025,
      "completions/mean_terminated_length": 360.3232177734375,
      "completions/min_length": 279.9,
      "completions/min_terminated_length": 279.9,
      "entropy": 0.4796965181827545,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 0.08740234375,
      "kl": 0.09200209667906165,
      "learning_rate": 5.033333333333333e-06,
      "loss": -0.009858855605125427,
      "num_tokens": 671557.0,
      "reward": 1.1115014195442199,
      "reward_std": 0.3146645646542311,
      "rewards/JointRewardFunction/mean": 1.1115014195442199,
      "rewards/JointRewardFunction/std": 0.3146645750850439,
      "step": 150,
      "step_time": 22.151629410100213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.225,
      "completions/max_length": 501.1,
      "completions/max_terminated_length": 447.8,
      "completions/mean_length": 400.075,
      "completions/mean_terminated_length": 361.0351318359375,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "entropy": 0.4785544477403164,
      "epoch": 1.0666666666666667,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.0,
      "kl": 0.0879205574747175,
      "learning_rate": 4.7e-06,
      "loss": 0.017533975839614867,
      "num_tokens": 716935.0,
      "reward": 1.1597753405570983,
      "reward_std": 0.29188001044094564,
      "rewards/JointRewardFunction/mean": 1.1597753405570983,
      "rewards/JointRewardFunction/std": 0.29188000336289405,
      "step": 160,
      "step_time": 22.98355916679975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 467.4,
      "completions/max_terminated_length": 460.7,
      "completions/mean_length": 363.275,
      "completions/mean_terminated_length": 354.8253631591797,
      "completions/min_length": 251.6,
      "completions/min_terminated_length": 251.6,
      "entropy": 0.5262451708316803,
      "epoch": 1.1333333333333333,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.060546875,
      "kl": 0.09955117981880904,
      "learning_rate": 4.366666666666667e-06,
      "loss": -0.007418201863765716,
      "num_tokens": 757865.0,
      "reward": 1.1451659560203553,
      "reward_std": 0.3273365020751953,
      "rewards/JointRewardFunction/mean": 1.1451659560203553,
      "rewards/JointRewardFunction/std": 0.3273364961147308,
      "step": 170,
      "step_time": 21.108805562603084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 452.5,
      "completions/max_terminated_length": 448.2,
      "completions/mean_length": 337.25,
      "completions/mean_terminated_length": 335.1607147216797,
      "completions/min_length": 241.7,
      "completions/min_terminated_length": 241.7,
      "entropy": 0.5567022401839494,
      "epoch": 1.2,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.068359375,
      "kl": 0.09737768466584384,
      "learning_rate": 4.033333333333333e-06,
      "loss": 0.018832828104496,
      "num_tokens": 796505.0,
      "reward": 1.2489745497703553,
      "reward_std": 0.11990191522636451,
      "rewards/JointRewardFunction/mean": 1.2489745497703553,
      "rewards/JointRewardFunction/std": 0.11990190770593472,
      "step": 180,
      "step_time": 20.626747212697957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1375,
      "completions/max_length": 473.7,
      "completions/max_terminated_length": 441.8,
      "completions/mean_length": 373.1875,
      "completions/mean_terminated_length": 352.4538177490234,
      "completions/min_length": 259.1,
      "completions/min_terminated_length": 259.1,
      "entropy": 0.5100005997344852,
      "epoch": 1.2666666666666666,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.039794921875,
      "kl": 0.09228390976786613,
      "learning_rate": 3.7e-06,
      "loss": -0.000696965865790844,
      "num_tokens": 839088.0,
      "reward": 1.23026362657547,
      "reward_std": 0.14840476661920549,
      "rewards/JointRewardFunction/mean": 1.23026362657547,
      "rewards/JointRewardFunction/std": 0.14840476512908934,
      "step": 190,
      "step_time": 21.574640466592246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1375,
      "completions/max_length": 480.2,
      "completions/max_terminated_length": 459.8,
      "completions/mean_length": 389.75,
      "completions/mean_terminated_length": 372.77679443359375,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "entropy": 0.5184403497725725,
      "epoch": 1.3333333333333333,
      "frac_reward_zero_std": 0.55,
      "grad_norm": 2.1875,
      "kl": 0.08728140082675964,
      "learning_rate": 3.366666666666667e-06,
      "loss": 0.021845726668834685,
      "num_tokens": 883220.0,
      "reward": 1.181435489654541,
      "reward_std": 0.24119414222077468,
      "rewards/JointRewardFunction/mean": 1.181435489654541,
      "rewards/JointRewardFunction/std": 0.24119413328007794,
      "step": 200,
      "step_time": 21.928263508094822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 460.3,
      "completions/max_terminated_length": 452.6,
      "completions/mean_length": 375.4625,
      "completions/mean_terminated_length": 366.30833740234374,
      "completions/min_length": 278.2,
      "completions/min_terminated_length": 278.2,
      "entropy": 0.5214369297027588,
      "epoch": 1.4,
      "frac_reward_zero_std": 0.55,
      "grad_norm": 1.4375,
      "kl": 0.0887975221965462,
      "learning_rate": 3.0333333333333337e-06,
      "loss": 0.01691504716873169,
      "num_tokens": 926517.0,
      "reward": 1.2304613709449768,
      "reward_std": 0.189005006296793,
      "rewards/JointRewardFunction/mean": 1.2304613709449768,
      "rewards/JointRewardFunction/std": 0.18900499549345112,
      "step": 210,
      "step_time": 21.30131801480311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1,
      "completions/max_length": 470.3,
      "completions/max_terminated_length": 448.9,
      "completions/mean_length": 365.95,
      "completions/mean_terminated_length": 351.73179016113284,
      "completions/min_length": 272.3,
      "completions/min_terminated_length": 272.3,
      "entropy": 0.5099331840872765,
      "epoch": 1.4666666666666668,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 2.0,
      "kl": 0.09267634809948504,
      "learning_rate": 2.7000000000000004e-06,
      "loss": -0.004276449233293534,
      "num_tokens": 969621.0,
      "reward": 1.0979979991912843,
      "reward_std": 0.326889356970787,
      "rewards/JointRewardFunction/mean": 1.0979979991912843,
      "rewards/JointRewardFunction/std": 0.32688935101032257,
      "step": 220,
      "step_time": 21.54177276209375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 437.0,
      "completions/max_terminated_length": 435.8,
      "completions/mean_length": 325.4375,
      "completions/mean_terminated_length": 323.2982147216797,
      "completions/min_length": 243.3,
      "completions/min_terminated_length": 243.3,
      "entropy": 0.527520533464849,
      "epoch": 1.5333333333333332,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 3.609375,
      "kl": 0.09654896147549152,
      "learning_rate": 2.3666666666666667e-06,
      "loss": 0.004230192676186561,
      "num_tokens": 1007468.0,
      "reward": 1.1674120664596557,
      "reward_std": 0.3256095230579376,
      "rewards/JointRewardFunction/mean": 1.1674120664596557,
      "rewards/JointRewardFunction/std": 0.32560951411724093,
      "step": 230,
      "step_time": 20.321823318899987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 471.3,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 387.2,
      "completions/mean_terminated_length": 362.8750030517578,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "entropy": 0.5140835266560316,
      "epoch": 1.6,
      "frac_reward_zero_std": 0.35,
      "grad_norm": 1.65625,
      "kl": 0.08105785646475852,
      "learning_rate": 2.0333333333333335e-06,
      "loss": 0.0024391064420342446,
      "num_tokens": 1053144.0,
      "reward": 1.1341503262519836,
      "reward_std": 0.3189578216522932,
      "rewards/JointRewardFunction/mean": 1.1341503262519836,
      "rewards/JointRewardFunction/std": 0.3189578127115965,
      "step": 240,
      "step_time": 21.733667162401254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 476.7,
      "completions/max_terminated_length": 449.8,
      "completions/mean_length": 376.975,
      "completions/mean_terminated_length": 357.80071716308595,
      "completions/min_length": 280.1,
      "completions/min_terminated_length": 280.1,
      "entropy": 0.5188658468425273,
      "epoch": 1.6666666666666665,
      "frac_reward_zero_std": 0.65,
      "grad_norm": 1.40625,
      "kl": 0.09223291147500276,
      "learning_rate": 1.7000000000000002e-06,
      "loss": 0.003794506937265396,
      "num_tokens": 1096622.0,
      "reward": 1.2100683093070983,
      "reward_std": 0.1948750299634412,
      "rewards/JointRewardFunction/mean": 1.2100683093070983,
      "rewards/JointRewardFunction/std": 0.19487502239644527,
      "step": 250,
      "step_time": 21.73429901890122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 478.8,
      "completions/max_terminated_length": 468.3,
      "completions/mean_length": 366.125,
      "completions/mean_terminated_length": 358.8738159179687,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "entropy": 0.547977139428258,
      "epoch": 1.7333333333333334,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 1.734375,
      "kl": 0.09155708220787347,
      "learning_rate": 1.3666666666666668e-06,
      "loss": -0.014781329035758971,
      "num_tokens": 1138920.0,
      "reward": 1.15995112657547,
      "reward_std": 0.32106488235294817,
      "rewards/JointRewardFunction/mean": 1.15995112657547,
      "rewards/JointRewardFunction/std": 0.32106486298143866,
      "step": 260,
      "step_time": 21.99600164630174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1375,
      "completions/max_length": 463.6,
      "completions/max_terminated_length": 438.9,
      "completions/mean_length": 375.1,
      "completions/mean_terminated_length": 358.83917541503905,
      "completions/min_length": 273.2,
      "completions/min_terminated_length": 273.2,
      "entropy": 0.5359004020690918,
      "epoch": 1.8,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 1.9140625,
      "kl": 0.087837297283113,
      "learning_rate": 1.0333333333333333e-06,
      "loss": 0.012757700681686402,
      "num_tokens": 1182736.0,
      "reward": 1.213478970527649,
      "reward_std": 0.17389502958394587,
      "rewards/JointRewardFunction/mean": 1.213478970527649,
      "rewards/JointRewardFunction/std": 0.17389501919969916,
      "step": 270,
      "step_time": 21.312464608701703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0875,
      "completions/max_length": 480.7,
      "completions/max_terminated_length": 456.9,
      "completions/mean_length": 372.65,
      "completions/mean_terminated_length": 360.74524841308596,
      "completions/min_length": 277.9,
      "completions/min_terminated_length": 277.9,
      "entropy": 0.5030612323433161,
      "epoch": 1.8666666666666667,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 1.4140625,
      "kl": 0.08702772008255125,
      "learning_rate": 7.000000000000001e-07,
      "loss": 0.019132000207901,
      "num_tokens": 1225656.0,
      "reward": 1.2122631311416625,
      "reward_std": 0.19066368174389936,
      "rewards/JointRewardFunction/mean": 1.2122631311416625,
      "rewards/JointRewardFunction/std": 0.19066367280320265,
      "step": 280,
      "step_time": 21.90045202969777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2125,
      "completions/max_length": 477.3,
      "completions/max_terminated_length": 440.9,
      "completions/mean_length": 382.5,
      "completions/mean_terminated_length": 354.6571441650391,
      "completions/min_length": 282.6,
      "completions/min_terminated_length": 282.6,
      "entropy": 0.5357337659224868,
      "epoch": 1.9333333333333333,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.765625,
      "kl": 0.0854645582381636,
      "learning_rate": 3.666666666666667e-07,
      "loss": -0.011803697794675827,
      "num_tokens": 1272012.0,
      "reward": 1.1109179258346558,
      "reward_std": 0.31746507063508034,
      "rewards/JointRewardFunction/mean": 1.1109179258346558,
      "rewards/JointRewardFunction/std": 0.31746507063508034,
      "step": 290,
      "step_time": 21.865317538700765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.175,
      "completions/max_length": 476.9,
      "completions/max_terminated_length": 435.9,
      "completions/mean_length": 365.8,
      "completions/mean_terminated_length": 339.0707153320312,
      "completions/min_length": 260.3,
      "completions/min_terminated_length": 260.3,
      "entropy": 0.5455268980935216,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.45,
      "grad_norm": 1.546875,
      "kl": 0.08734047506004572,
      "learning_rate": 3.333333333333334e-08,
      "loss": -0.005074360966682434,
      "num_tokens": 1314476.0,
      "reward": 1.0770751595497132,
      "reward_std": 0.38892474174499514,
      "rewards/JointRewardFunction/mean": 1.0770751595497132,
      "rewards/JointRewardFunction/std": 0.3889247328042984,
      "step": 300,
      "step_time": 21.719864126896574
    }
  ],
  "logging_steps": 10,
  "max_steps": 300,
  "num_input_tokens_seen": 1314476,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}