expfinal-qwen-island-s42-la…/last-checkpoint/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.28,
  "eval_steps": 500,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 894.8,
      "completions/max_terminated_length": 821.2,
      "completions/mean_length": 560.2,
      "completions/mean_terminated_length": 540.6345306396485,
      "completions/min_length": 279.5,
      "completions/min_terminated_length": 279.5,
      "entropy": 0.38268125932663677,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.4375,
      "kl": 0.07789193278222228,
      "learning_rate": 9.4375e-06,
      "loss": -0.007836591452360153,
      "num_tokens": 65460.0,
      "reward": 0.65,
      "reward_std": 0.46797851026058196,
      "rewards/JointRewardFunction/mean": 0.65,
      "rewards/JointRewardFunction/std": 0.4679785281419754,
      "step": 10,
      "step_time": 36.23508502000004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 900.1,
      "completions/max_terminated_length": 823.7,
      "completions/mean_length": 578.525,
      "completions/mean_terminated_length": 543.8116729736328,
      "completions/min_length": 254.6,
      "completions/min_terminated_length": 254.6,
      "entropy": 0.23411482032388448,
      "epoch": 0.16,
      "frac_reward_zero_std": 0.775,
      "grad_norm": 1.1484375,
      "kl": 0.1648747116792947,
      "learning_rate": 8.8125e-06,
      "loss": 0.0052785202860832214,
      "num_tokens": 132386.0,
      "reward": 0.6625,
      "reward_std": 0.46628117859363555,
      "rewards/JointRewardFunction/mean": 0.6625,
      "rewards/JointRewardFunction/std": 0.4662812024354935,
      "step": 20,
      "step_time": 37.938748809599566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 876.9,
      "completions/max_terminated_length": 829.5,
      "completions/mean_length": 622.2,
      "completions/mean_terminated_length": 611.7642944335937,
      "completions/min_length": 477.6,
      "completions/min_terminated_length": 477.6,
      "entropy": 0.22628286899998784,
      "epoch": 0.24,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.0218505859375,
      "kl": 0.12199527090415359,
      "learning_rate": 8.1875e-06,
      "loss": 0.007959160953760147,
      "num_tokens": 202606.0,
      "reward": 0.9375,
      "reward_std": 0.12246559262275696,
      "rewards/JointRewardFunction/mean": 0.9375,
      "rewards/JointRewardFunction/std": 0.12246559858322144,
      "step": 30,
      "step_time": 37.057444848399975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 810.6,
      "completions/max_terminated_length": 731.4,
      "completions/mean_length": 596.8625,
      "completions/mean_terminated_length": 584.707144165039,
      "completions/min_length": 438.4,
      "completions/min_terminated_length": 438.4,
      "entropy": 0.42070485297590493,
      "epoch": 0.32,
      "frac_reward_zero_std": 0.85,
      "grad_norm": 1.1015625,
      "kl": 0.10627949037589132,
      "learning_rate": 7.5625e-06,
      "loss": -0.00913204848766327,
      "num_tokens": 270929.0,
      "reward": 0.9,
      "reward_std": 0.20411193668842315,
      "rewards/JointRewardFunction/mean": 0.9,
      "rewards/JointRewardFunction/std": 0.20411194264888763,
      "step": 40,
      "step_time": 34.16536340820039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 916.6,
      "completions/max_terminated_length": 800.7,
      "completions/mean_length": 642.275,
      "completions/mean_terminated_length": 620.6982238769531,
      "completions/min_length": 483.4,
      "completions/min_terminated_length": 483.4,
      "entropy": 0.545534435659647,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.825,
      "grad_norm": 0.9921875,
      "kl": 0.09743389897048474,
      "learning_rate": 6.9375e-06,
      "loss": 0.019783291220664977,
      "num_tokens": 343141.0,
      "reward": 0.9125,
      "reward_std": 0.19864802658557892,
      "rewards/JointRewardFunction/mean": 0.9125,
      "rewards/JointRewardFunction/std": 0.19864802658557892,
      "step": 50,
      "step_time": 38.55176728389906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 774.9,
      "completions/max_terminated_length": 774.9,
      "completions/mean_length": 599.525,
      "completions/mean_terminated_length": 599.525,
      "completions/min_length": 438.8,
      "completions/min_terminated_length": 438.8,
      "entropy": 0.5808290097862482,
      "epoch": 0.48,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.035400390625,
      "kl": 0.10702053690329194,
      "learning_rate": 6.3125e-06,
      "loss": -0.009540864825248718,
      "num_tokens": 411443.0,
      "reward": 0.9625,
      "reward_std": 0.10606601536273956,
      "rewards/JointRewardFunction/mean": 0.9625,
      "rewards/JointRewardFunction/std": 0.10606601536273956,
      "step": 60,
      "step_time": 33.111361246699744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0375,
      "completions/max_length": 848.1,
      "completions/max_terminated_length": 757.4,
      "completions/mean_length": 626.325,
      "completions/mean_terminated_length": 609.8857238769531,
      "completions/min_length": 511.5,
      "completions/min_terminated_length": 511.5,
      "entropy": 0.4436331996694207,
      "epoch": 0.56,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.060791015625,
      "kl": 0.08952742610126734,
      "learning_rate": 5.6875e-06,
      "loss": 0.014259077608585358,
      "num_tokens": 482005.0,
      "reward": 0.95,
      "reward_std": 0.11700168251991272,
      "rewards/JointRewardFunction/mean": 0.95,
      "rewards/JointRewardFunction/std": 0.11700168251991272,
      "step": 70,
      "step_time": 35.991650615099935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 787.7,
      "completions/max_terminated_length": 747.8,
      "completions/mean_length": 593.25,
      "completions/mean_terminated_length": 587.5642883300782,
      "completions/min_length": 493.6,
      "completions/min_terminated_length": 493.6,
      "entropy": 0.328166064620018,
      "epoch": 0.64,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.017822265625,
      "kl": 0.1307119549252093,
      "learning_rate": 5.0625e-06,
      "loss": 0.0031396135687828063,
      "num_tokens": 550079.0,
      "reward": 0.975,
      "reward_std": 0.07071067690849304,
      "rewards/JointRewardFunction/mean": 0.975,
      "rewards/JointRewardFunction/std": 0.07071067690849304,
      "step": 80,
      "step_time": 33.55307105900029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 788.4,
      "completions/max_terminated_length": 788.4,
      "completions/mean_length": 619.825,
      "completions/mean_terminated_length": 619.825,
      "completions/min_length": 510.4,
      "completions/min_terminated_length": 510.4,
      "entropy": 0.4017201948910952,
      "epoch": 0.72,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0172119140625,
      "kl": 0.08959094756282866,
      "learning_rate": 4.4375e-06,
      "loss": 0.001467562187463045,
      "num_tokens": 620285.0,
      "reward": 0.9625,
      "reward_std": 0.10606601536273956,
      "rewards/JointRewardFunction/mean": 0.9625,
      "rewards/JointRewardFunction/std": 0.10606601536273956,
      "step": 90,
      "step_time": 33.41816141909967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 911.0,
      "completions/max_terminated_length": 831.5,
      "completions/mean_length": 671.025,
      "completions/mean_terminated_length": 653.7214416503906,
      "completions/min_length": 537.5,
      "completions/min_terminated_length": 537.5,
      "entropy": 0.4100338226184249,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.021240234375,
      "kl": 0.07856191159226,
      "learning_rate": 3.8125e-06,
      "loss": 0.017280958592891693,
      "num_tokens": 694633.0,
      "reward": 0.9375,
      "reward_std": 0.1767766922712326,
      "rewards/JointRewardFunction/mean": 0.9375,
      "rewards/JointRewardFunction/std": 0.1767766922712326,
      "step": 100,
      "step_time": 38.310592102500415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 867.4,
      "completions/max_terminated_length": 779.6,
      "completions/mean_length": 642.0875,
      "completions/mean_terminated_length": 616.8726257324219,
      "completions/min_length": 501.5,
      "completions/min_terminated_length": 501.5,
      "entropy": 0.40164962466806176,
      "epoch": 0.88,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 1.1328125,
      "kl": 0.0828359558712691,
      "learning_rate": 3.1875e-06,
      "loss": 0.009746464341878891,
      "num_tokens": 766544.0,
      "reward": 0.9625,
      "reward_std": 0.0816463440656662,
      "rewards/JointRewardFunction/mean": 0.9625,
      "rewards/JointRewardFunction/std": 0.0816463440656662,
      "step": 110,
      "step_time": 36.619024862399236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0375,
      "completions/max_length": 807.3,
      "completions/max_terminated_length": 713.2,
      "completions/mean_length": 611.7875,
      "completions/mean_terminated_length": 595.7928649902344,
      "completions/min_length": 508.7,
      "completions/min_terminated_length": 508.7,
      "entropy": 0.40618473663926125,
      "epoch": 0.96,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 1.03125,
      "kl": 0.08407443668693304,
      "learning_rate": 2.5625e-06,
      "loss": 0.01084473505616188,
      "num_tokens": 835859.0,
      "reward": 0.9625,
      "reward_std": 0.10606601536273956,
      "rewards/JointRewardFunction/mean": 0.9625,
      "rewards/JointRewardFunction/std": 0.10606601536273956,
      "step": 120,
      "step_time": 34.16594214020042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 751.9,
      "completions/max_terminated_length": 748.3,
      "completions/mean_length": 631.1,
      "completions/mean_terminated_length": 626.9517883300781,
      "completions/min_length": 544.1,
      "completions/min_terminated_length": 544.1,
      "entropy": 0.42196682561188936,
      "epoch": 1.04,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.041748046875,
      "kl": 0.08644672441296279,
      "learning_rate": 1.9375e-06,
      "loss": 0.00017178469570353628,
      "num_tokens": 906965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/JointRewardFunction/mean": 1.0,
      "rewards/JointRewardFunction/std": 0.0,
      "step": 130,
      "step_time": 31.96281487729957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 837.6,
      "completions/max_terminated_length": 826.9,
      "completions/mean_length": 642.5625,
      "completions/mean_terminated_length": 637.825,
      "completions/min_length": 522.1,
      "completions/min_terminated_length": 522.1,
      "entropy": 0.39782516546547414,
      "epoch": 1.12,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9375,
      "kl": 0.08095719190314413,
      "learning_rate": 1.3125000000000001e-06,
      "loss": 0.0059084448963403705,
      "num_tokens": 979018.0,
      "reward": 0.9375,
      "reward_std": 0.1767766922712326,
      "rewards/JointRewardFunction/mean": 0.9375,
      "rewards/JointRewardFunction/std": 0.1767766922712326,
      "step": 140,
      "step_time": 35.47339765110009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 780.2,
      "completions/max_terminated_length": 709.4,
      "completions/mean_length": 608.125,
      "completions/mean_terminated_length": 597.3017883300781,
      "completions/min_length": 514.2,
      "completions/min_terminated_length": 514.2,
      "entropy": 0.3825716434046626,
      "epoch": 1.2,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0263671875,
      "kl": 0.0812916701193899,
      "learning_rate": 6.875000000000001e-07,
      "loss": 0.014202636480331422,
      "num_tokens": 1048244.0,
      "reward": 0.95,
      "reward_std": 0.11700168251991272,
      "rewards/JointRewardFunction/mean": 0.95,
      "rewards/JointRewardFunction/std": 0.11700168251991272,
      "step": 150,
      "step_time": 33.213279404800595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 785.1,
      "completions/max_terminated_length": 743.9,
      "completions/mean_length": 621.9,
      "completions/mean_terminated_length": 612.2803649902344,
      "completions/min_length": 508.9,
      "completions/min_terminated_length": 508.9,
      "entropy": 0.4208029452711344,
      "epoch": 1.28,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.92578125,
      "kl": 0.08554110652767122,
      "learning_rate": 6.250000000000001e-08,
      "loss": 0.011326169967651368,
      "num_tokens": 1118898.0,
      "reward": 0.975,
      "reward_std": 0.07071067690849304,
      "rewards/JointRewardFunction/mean": 0.975,
      "rewards/JointRewardFunction/std": 0.07071067690849304,
      "step": 160,
      "step_time": 33.45168144740146
    }
  ],
  "logging_steps": 10,
  "max_steps": 160,
  "num_input_tokens_seen": 1118898,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}