socialcontract-policy-7b-v1/checkpoint-50/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7575757575757576,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 177.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.8,
      "completions/max_terminated_length": 177.8,
      "completions/mean_length": 157.85000610351562,
      "completions/mean_terminated_length": 157.85000610351562,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.07575757575757576,
      "frac_reward_zero_std": 0.4000000059604645,
      "grad_norm": 1.5840047597885132,
      "kl": 0.0010059793893522702,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.0,
      "num_tokens": 73447.0,
      "reward": 0.5880883574485779,
      "reward_std": 0.020529226586222648,
      "rewards/reward_function/mean": 0.5880883395671844,
      "rewards/reward_function/std": 0.06562883183360099,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 176.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.2,
      "completions/max_terminated_length": 176.2,
      "completions/mean_length": 156.65000915527344,
      "completions/mean_terminated_length": 156.65000915527344,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.15151515151515152,
      "frac_reward_zero_std": 0.26666667461395266,
      "grad_norm": 1.6390373706817627,
      "kl": 0.0017284046276472508,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 146334.0,
      "reward": 0.605418348312378,
      "reward_std": 0.02508251890540123,
      "rewards/reward_function/mean": 0.60541832447052,
      "rewards/reward_function/std": 0.06859094277024269,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 176.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.4,
      "completions/max_terminated_length": 176.4,
      "completions/mean_length": 157.0000030517578,
      "completions/mean_terminated_length": 157.0000030517578,
      "completions/min_length": 140.8,
      "completions/min_terminated_length": 140.8,
      "epoch": 0.22727272727272727,
      "frac_reward_zero_std": 0.26666667461395266,
      "grad_norm": 0.7626600861549377,
      "kl": 0.003397522373901059,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.0,
      "num_tokens": 219198.0,
      "reward": 0.5862850427627564,
      "reward_std": 0.036518129706382754,
      "rewards/reward_function/mean": 0.5862850069999694,
      "rewards/reward_function/std": 0.08488646671175956,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 183.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 156.98334045410155,
      "completions/mean_terminated_length": 156.98334045410155,
      "completions/min_length": 139.8,
      "completions/min_terminated_length": 139.8,
      "epoch": 0.30303030303030304,
      "frac_reward_zero_std": 0.26666667461395266,
      "grad_norm": 0.9624250531196594,
      "kl": 0.007015585945919156,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.0,
      "num_tokens": 291737.0,
      "reward": 0.6001700401306153,
      "reward_std": 0.025772593356668948,
      "rewards/reward_function/mean": 0.6001700043678284,
      "rewards/reward_function/std": 0.07909451425075531,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 172.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.2,
      "completions/max_terminated_length": 172.2,
      "completions/mean_length": 155.23333740234375,
      "completions/mean_terminated_length": 155.23333740234375,
      "completions/min_length": 138.2,
      "completions/min_terminated_length": 138.2,
      "epoch": 0.3787878787878788,
      "frac_reward_zero_std": 0.26666667461395266,
      "grad_norm": 0.5355867743492126,
      "kl": 0.009492208405087391,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.0,
      "num_tokens": 364535.0,
      "reward": 0.5766633510589599,
      "reward_std": 0.04085115455091,
      "rewards/reward_function/mean": 0.576663339138031,
      "rewards/reward_function/std": 0.10587597712874412,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 198.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.4,
      "completions/max_terminated_length": 198.4,
      "completions/mean_length": 153.6166748046875,
      "completions/mean_terminated_length": 153.6166748046875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.45454545454545453,
      "frac_reward_zero_std": 0.13333333730697633,
      "grad_norm": 0.6549646854400635,
      "kl": 0.061492755884925525,
      "learning_rate": 1.16e-05,
      "loss": 0.0001,
      "num_tokens": 436992.0,
      "reward": 0.5978150248527527,
      "reward_std": 0.04262940138578415,
      "rewards/reward_function/mean": 0.5978150129318237,
      "rewards/reward_function/std": 0.09431936666369438,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 176.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 158.86667175292968,
      "completions/mean_terminated_length": 158.86667175292968,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.5303030303030303,
      "frac_reward_zero_std": 0.40000001192092893,
      "grad_norm": 0.4725801348686218,
      "kl": 2.357983988771836,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.0024,
      "num_tokens": 509788.0,
      "reward": 0.6049700140953064,
      "reward_std": 0.012831439916044473,
      "rewards/reward_function/mean": 0.6049699783325195,
      "rewards/reward_function/std": 0.08928216472268105,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 172.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.4,
      "completions/max_terminated_length": 172.4,
      "completions/mean_length": 154.5500030517578,
      "completions/mean_terminated_length": 154.5500030517578,
      "completions/min_length": 138.8,
      "completions/min_terminated_length": 138.8,
      "epoch": 0.6060606060606061,
      "frac_reward_zero_std": 0.4666666746139526,
      "grad_norm": 0.41579461097717285,
      "kl": 0.10282722649474939,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.0001,
      "num_tokens": 582789.0,
      "reward": 0.5595033764839172,
      "reward_std": 0.014409982354845852,
      "rewards/reward_function/mean": 0.5595033466815948,
      "rewards/reward_function/std": 0.053104204079136255,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 202.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.6,
      "completions/max_terminated_length": 202.6,
      "completions/mean_length": 158.20000305175782,
      "completions/mean_terminated_length": 158.20000305175782,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.6818181818181818,
      "frac_reward_zero_std": 0.33333333730697634,
      "grad_norm": 0.03286667913198471,
      "kl": 3726.0936788400013,
      "learning_rate": 1.76e-05,
      "loss": 3.7261,
      "num_tokens": 655565.0,
      "reward": 0.5829650402069092,
      "reward_std": 0.031194474175572397,
      "rewards/reward_function/mean": 0.5829649925231933,
      "rewards/reward_function/std": 0.09724260903894902,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completion_length": 180.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.4,
      "completions/max_terminated_length": 180.4,
      "completions/mean_length": 157.5500030517578,
      "completions/mean_terminated_length": 157.5500030517578,
      "completions/min_length": 141.2,
      "completions/min_terminated_length": 141.2,
      "epoch": 0.7575757575757576,
      "frac_reward_zero_std": 0.26666667461395266,
      "grad_norm": 0.45231395959854126,
      "kl": 0.24270717451969784,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.0002,
      "num_tokens": 727754.0,
      "reward": 0.6422233581542969,
      "reward_std": 0.015453202556818724,
      "rewards/reward_function/mean": 0.642223310470581,
      "rewards/reward_function/std": 0.08873879238963127,
      "step": 50
    }
  ],
  "logging_steps": 5,
  "max_steps": 500,
  "num_input_tokens_seen": 727754,
  "num_train_epochs": 8,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
初始化项目，由ModelHub XC社区提供模型 Model: Tyr-123/socialcontract-policy-7b-v1 Source: Original Platform 2026-04-29 22:34:48 +08:00			`{`
			`"best_global_step": null,`
			`"best_metric": null,`
			`"best_model_checkpoint": null,`
			`"epoch": 0.7575757575757576,`
			`"eval_steps": 500,`
			`"global_step": 50,`
			`"is_hyper_param_search": false,`
			`"is_local_process_zero": true,`
			`"is_world_process_zero": true,`
			`"log_history": [`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 177.8,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 177.8,`
			`"completions/max_terminated_length": 177.8,`
			`"completions/mean_length": 157.85000610351562,`
			`"completions/mean_terminated_length": 157.85000610351562,`
			`"completions/min_length": 136.0,`
			`"completions/min_terminated_length": 136.0,`
			`"epoch": 0.07575757575757576,`
			`"frac_reward_zero_std": 0.4000000059604645,`
			`"grad_norm": 1.5840047597885132,`
			`"kl": 0.0010059793893522702,`
			`"learning_rate": 1.6000000000000001e-06,`
			`"loss": 0.0,`
			`"num_tokens": 73447.0,`
			`"reward": 0.5880883574485779,`
			`"reward_std": 0.020529226586222648,`
			`"rewards/reward_function/mean": 0.5880883395671844,`
			`"rewards/reward_function/std": 0.06562883183360099,`
			`"step": 5`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 176.2,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 176.2,`
			`"completions/max_terminated_length": 176.2,`
			`"completions/mean_length": 156.65000915527344,`
			`"completions/mean_terminated_length": 156.65000915527344,`
			`"completions/min_length": 138.0,`
			`"completions/min_terminated_length": 138.0,`
			`"epoch": 0.15151515151515152,`
			`"frac_reward_zero_std": 0.26666667461395266,`
			`"grad_norm": 1.6390373706817627,`
			`"kl": 0.0017284046276472508,`
			`"learning_rate": 3.6000000000000003e-06,`
			`"loss": 0.0,`
			`"num_tokens": 146334.0,`
			`"reward": 0.605418348312378,`
			`"reward_std": 0.02508251890540123,`
			`"rewards/reward_function/mean": 0.60541832447052,`
			`"rewards/reward_function/std": 0.06859094277024269,`
			`"step": 10`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 176.4,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 176.4,`
			`"completions/max_terminated_length": 176.4,`
			`"completions/mean_length": 157.0000030517578,`
			`"completions/mean_terminated_length": 157.0000030517578,`
			`"completions/min_length": 140.8,`
			`"completions/min_terminated_length": 140.8,`
			`"epoch": 0.22727272727272727,`
			`"frac_reward_zero_std": 0.26666667461395266,`
			`"grad_norm": 0.7626600861549377,`
			`"kl": 0.003397522373901059,`
			`"learning_rate": 5.600000000000001e-06,`
			`"loss": 0.0,`
			`"num_tokens": 219198.0,`
			`"reward": 0.5862850427627564,`
			`"reward_std": 0.036518129706382754,`
			`"rewards/reward_function/mean": 0.5862850069999694,`
			`"rewards/reward_function/std": 0.08488646671175956,`
			`"step": 15`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 183.0,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 183.0,`
			`"completions/max_terminated_length": 183.0,`
			`"completions/mean_length": 156.98334045410155,`
			`"completions/mean_terminated_length": 156.98334045410155,`
			`"completions/min_length": 139.8,`
			`"completions/min_terminated_length": 139.8,`
			`"epoch": 0.30303030303030304,`
			`"frac_reward_zero_std": 0.26666667461395266,`
			`"grad_norm": 0.9624250531196594,`
			`"kl": 0.007015585945919156,`
			`"learning_rate": 7.600000000000001e-06,`
			`"loss": 0.0,`
			`"num_tokens": 291737.0,`
			`"reward": 0.6001700401306153,`
			`"reward_std": 0.025772593356668948,`
			`"rewards/reward_function/mean": 0.6001700043678284,`
			`"rewards/reward_function/std": 0.07909451425075531,`
			`"step": 20`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 172.2,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 172.2,`
			`"completions/max_terminated_length": 172.2,`
			`"completions/mean_length": 155.23333740234375,`
			`"completions/mean_terminated_length": 155.23333740234375,`
			`"completions/min_length": 138.2,`
			`"completions/min_terminated_length": 138.2,`
			`"epoch": 0.3787878787878788,`
			`"frac_reward_zero_std": 0.26666667461395266,`
			`"grad_norm": 0.5355867743492126,`
			`"kl": 0.009492208405087391,`
			`"learning_rate": 9.600000000000001e-06,`
			`"loss": 0.0,`
			`"num_tokens": 364535.0,`
			`"reward": 0.5766633510589599,`
			`"reward_std": 0.04085115455091,`
			`"rewards/reward_function/mean": 0.576663339138031,`
			`"rewards/reward_function/std": 0.10587597712874412,`
			`"step": 25`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 198.4,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 198.4,`
			`"completions/max_terminated_length": 198.4,`
			`"completions/mean_length": 153.6166748046875,`
			`"completions/mean_terminated_length": 153.6166748046875,`
			`"completions/min_length": 136.0,`
			`"completions/min_terminated_length": 136.0,`
			`"epoch": 0.45454545454545453,`
			`"frac_reward_zero_std": 0.13333333730697633,`
			`"grad_norm": 0.6549646854400635,`
			`"kl": 0.061492755884925525,`
			`"learning_rate": 1.16e-05,`
			`"loss": 0.0001,`
			`"num_tokens": 436992.0,`
			`"reward": 0.5978150248527527,`
			`"reward_std": 0.04262940138578415,`
			`"rewards/reward_function/mean": 0.5978150129318237,`
			`"rewards/reward_function/std": 0.09431936666369438,`
			`"step": 30`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 176.0,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 176.0,`
			`"completions/max_terminated_length": 176.0,`
			`"completions/mean_length": 158.86667175292968,`
			`"completions/mean_terminated_length": 158.86667175292968,`
			`"completions/min_length": 145.0,`
			`"completions/min_terminated_length": 145.0,`
			`"epoch": 0.5303030303030303,`
			`"frac_reward_zero_std": 0.40000001192092893,`
			`"grad_norm": 0.4725801348686218,`
			`"kl": 2.357983988771836,`
			`"learning_rate": 1.3600000000000002e-05,`
			`"loss": 0.0024,`
			`"num_tokens": 509788.0,`
			`"reward": 0.6049700140953064,`
			`"reward_std": 0.012831439916044473,`
			`"rewards/reward_function/mean": 0.6049699783325195,`
			`"rewards/reward_function/std": 0.08928216472268105,`
			`"step": 35`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 172.4,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 172.4,`
			`"completions/max_terminated_length": 172.4,`
			`"completions/mean_length": 154.5500030517578,`
			`"completions/mean_terminated_length": 154.5500030517578,`
			`"completions/min_length": 138.8,`
			`"completions/min_terminated_length": 138.8,`
			`"epoch": 0.6060606060606061,`
			`"frac_reward_zero_std": 0.4666666746139526,`
			`"grad_norm": 0.41579461097717285,`
			`"kl": 0.10282722649474939,`
			`"learning_rate": 1.5600000000000003e-05,`
			`"loss": 0.0001,`
			`"num_tokens": 582789.0,`
			`"reward": 0.5595033764839172,`
			`"reward_std": 0.014409982354845852,`
			`"rewards/reward_function/mean": 0.5595033466815948,`
			`"rewards/reward_function/std": 0.053104204079136255,`
			`"step": 40`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 202.6,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 202.6,`
			`"completions/max_terminated_length": 202.6,`
			`"completions/mean_length": 158.20000305175782,`
			`"completions/mean_terminated_length": 158.20000305175782,`
			`"completions/min_length": 136.0,`
			`"completions/min_terminated_length": 136.0,`
			`"epoch": 0.6818181818181818,`
			`"frac_reward_zero_std": 0.33333333730697634,`
			`"grad_norm": 0.03286667913198471,`
			`"kl": 3726.0936788400013,`
			`"learning_rate": 1.76e-05,`
			`"loss": 3.7261,`
			`"num_tokens": 655565.0,`
			`"reward": 0.5829650402069092,`
			`"reward_std": 0.031194474175572397,`
			`"rewards/reward_function/mean": 0.5829649925231933,`
			`"rewards/reward_function/std": 0.09724260903894902,`
			`"step": 45`
			`},`
			`{`
			`"clip_ratio/high_max": 0.0,`
			`"clip_ratio/high_mean": 0.0,`
			`"clip_ratio/low_mean": 0.0,`
			`"clip_ratio/low_min": 0.0,`
			`"clip_ratio/region_mean": 0.0,`
			`"completion_length": 180.4,`
			`"completions/clipped_ratio": 0.0,`
			`"completions/max_length": 180.4,`
			`"completions/max_terminated_length": 180.4,`
			`"completions/mean_length": 157.5500030517578,`
			`"completions/mean_terminated_length": 157.5500030517578,`
			`"completions/min_length": 141.2,`
			`"completions/min_terminated_length": 141.2,`
			`"epoch": 0.7575757575757576,`
			`"frac_reward_zero_std": 0.26666667461395266,`
			`"grad_norm": 0.45231395959854126,`
			`"kl": 0.24270717451969784,`
			`"learning_rate": 1.9600000000000002e-05,`
			`"loss": 0.0002,`
			`"num_tokens": 727754.0,`
			`"reward": 0.6422233581542969,`
			`"reward_std": 0.015453202556818724,`
			`"rewards/reward_function/mean": 0.642223310470581,`
			`"rewards/reward_function/std": 0.08873879238963127,`
			`"step": 50`
			`}`
			`],`
			`"logging_steps": 5,`
			`"max_steps": 500,`
			`"num_input_tokens_seen": 727754,`
			`"num_train_epochs": 8,`
			`"save_steps": 50,`
			`"stateful_callbacks": {`
			`"TrainerControl": {`
			`"args": {`
			`"should_epoch_stop": false,`
			`"should_evaluate": false,`
			`"should_log": false,`
			`"should_save": true,`
			`"should_training_stop": false`
			`},`
			`"attributes": {}`
			`}`
			`},`
			`"total_flos": 0.0,`
			`"train_batch_size": 4,`
			`"trial_name": null,`
			`"trial_params": null`
			`}`