{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.28, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 894.8, "completions/max_terminated_length": 821.2, "completions/mean_length": 560.2, "completions/mean_terminated_length": 540.6345306396485, "completions/min_length": 279.5, "completions/min_terminated_length": 279.5, "entropy": 0.38268125932663677, "epoch": 0.08, "frac_reward_zero_std": 0.75, "grad_norm": 1.4375, "kl": 0.07789193278222228, "learning_rate": 9.4375e-06, "loss": -0.007836591452360153, "num_tokens": 65460.0, "reward": 0.65, "reward_std": 0.46797851026058196, "rewards/JointRewardFunction/mean": 0.65, "rewards/JointRewardFunction/std": 0.4679785281419754, "step": 10, "step_time": 36.23508502000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 900.1, "completions/max_terminated_length": 823.7, "completions/mean_length": 578.525, "completions/mean_terminated_length": 543.8116729736328, "completions/min_length": 254.6, "completions/min_terminated_length": 254.6, "entropy": 0.23411482032388448, "epoch": 0.16, "frac_reward_zero_std": 0.775, "grad_norm": 1.1484375, "kl": 0.1648747116792947, "learning_rate": 8.8125e-06, "loss": 0.0052785202860832214, "num_tokens": 132386.0, "reward": 0.6625, "reward_std": 0.46628117859363555, "rewards/JointRewardFunction/mean": 0.6625, "rewards/JointRewardFunction/std": 0.4662812024354935, "step": 20, "step_time": 37.938748809599566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 876.9, "completions/max_terminated_length": 829.5, "completions/mean_length": 622.2, "completions/mean_terminated_length": 611.7642944335937, "completions/min_length": 477.6, "completions/min_terminated_length": 477.6, "entropy": 0.22628286899998784, "epoch": 0.24, "frac_reward_zero_std": 0.875, "grad_norm": 0.0218505859375, "kl": 0.12199527090415359, "learning_rate": 8.1875e-06, "loss": 0.007959160953760147, "num_tokens": 202606.0, "reward": 0.9375, "reward_std": 0.12246559262275696, "rewards/JointRewardFunction/mean": 0.9375, "rewards/JointRewardFunction/std": 0.12246559858322144, "step": 30, "step_time": 37.057444848399975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 810.6, "completions/max_terminated_length": 731.4, "completions/mean_length": 596.8625, "completions/mean_terminated_length": 584.707144165039, "completions/min_length": 438.4, "completions/min_terminated_length": 438.4, "entropy": 0.42070485297590493, "epoch": 0.32, "frac_reward_zero_std": 0.85, "grad_norm": 1.1015625, "kl": 0.10627949037589132, "learning_rate": 7.5625e-06, "loss": -0.00913204848766327, "num_tokens": 270929.0, "reward": 0.9, "reward_std": 0.20411193668842315, "rewards/JointRewardFunction/mean": 0.9, "rewards/JointRewardFunction/std": 0.20411194264888763, "step": 40, "step_time": 34.16536340820039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 916.6, "completions/max_terminated_length": 800.7, "completions/mean_length": 642.275, "completions/mean_terminated_length": 620.6982238769531, "completions/min_length": 483.4, "completions/min_terminated_length": 483.4, "entropy": 0.545534435659647, "epoch": 0.4, "frac_reward_zero_std": 0.825, "grad_norm": 0.9921875, "kl": 0.09743389897048474, "learning_rate": 6.9375e-06, "loss": 0.019783291220664977, "num_tokens": 343141.0, "reward": 0.9125, "reward_std": 0.19864802658557892, "rewards/JointRewardFunction/mean": 0.9125, "rewards/JointRewardFunction/std": 0.19864802658557892, "step": 50, "step_time": 38.55176728389906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.9, "completions/max_terminated_length": 774.9, "completions/mean_length": 599.525, "completions/mean_terminated_length": 599.525, "completions/min_length": 438.8, "completions/min_terminated_length": 438.8, "entropy": 0.5808290097862482, "epoch": 0.48, "frac_reward_zero_std": 0.925, "grad_norm": 0.035400390625, "kl": 0.10702053690329194, "learning_rate": 6.3125e-06, "loss": -0.009540864825248718, "num_tokens": 411443.0, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/JointRewardFunction/mean": 0.9625, "rewards/JointRewardFunction/std": 0.10606601536273956, "step": 60, "step_time": 33.111361246699744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 848.1, "completions/max_terminated_length": 757.4, "completions/mean_length": 626.325, "completions/mean_terminated_length": 609.8857238769531, "completions/min_length": 511.5, "completions/min_terminated_length": 511.5, "entropy": 0.4436331996694207, "epoch": 0.56, "frac_reward_zero_std": 0.9, "grad_norm": 0.060791015625, "kl": 0.08952742610126734, "learning_rate": 5.6875e-06, "loss": 0.014259077608585358, "num_tokens": 482005.0, "reward": 0.95, "reward_std": 0.11700168251991272, "rewards/JointRewardFunction/mean": 0.95, "rewards/JointRewardFunction/std": 0.11700168251991272, "step": 70, "step_time": 35.991650615099935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 787.7, "completions/max_terminated_length": 747.8, "completions/mean_length": 593.25, "completions/mean_terminated_length": 587.5642883300782, "completions/min_length": 493.6, "completions/min_terminated_length": 493.6, "entropy": 0.328166064620018, "epoch": 0.64, "frac_reward_zero_std": 0.95, "grad_norm": 0.017822265625, "kl": 0.1307119549252093, "learning_rate": 5.0625e-06, "loss": 0.0031396135687828063, "num_tokens": 550079.0, "reward": 0.975, "reward_std": 0.07071067690849304, "rewards/JointRewardFunction/mean": 0.975, "rewards/JointRewardFunction/std": 0.07071067690849304, "step": 80, "step_time": 33.55307105900029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.4, "completions/max_terminated_length": 788.4, "completions/mean_length": 619.825, "completions/mean_terminated_length": 619.825, "completions/min_length": 510.4, "completions/min_terminated_length": 510.4, "entropy": 0.4017201948910952, "epoch": 0.72, "frac_reward_zero_std": 0.925, "grad_norm": 0.0172119140625, "kl": 0.08959094756282866, "learning_rate": 4.4375e-06, "loss": 0.001467562187463045, "num_tokens": 620285.0, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/JointRewardFunction/mean": 0.9625, "rewards/JointRewardFunction/std": 0.10606601536273956, "step": 90, "step_time": 33.41816141909967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 911.0, "completions/max_terminated_length": 831.5, "completions/mean_length": 671.025, "completions/mean_terminated_length": 653.7214416503906, "completions/min_length": 537.5, "completions/min_terminated_length": 537.5, "entropy": 0.4100338226184249, "epoch": 0.8, "frac_reward_zero_std": 0.875, "grad_norm": 0.021240234375, "kl": 0.07856191159226, "learning_rate": 3.8125e-06, "loss": 0.017280958592891693, "num_tokens": 694633.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/JointRewardFunction/mean": 0.9375, "rewards/JointRewardFunction/std": 0.1767766922712326, "step": 100, "step_time": 38.310592102500415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 867.4, "completions/max_terminated_length": 779.6, "completions/mean_length": 642.0875, "completions/mean_terminated_length": 616.8726257324219, "completions/min_length": 501.5, "completions/min_terminated_length": 501.5, "entropy": 0.40164962466806176, "epoch": 0.88, "frac_reward_zero_std": 0.925, "grad_norm": 1.1328125, "kl": 0.0828359558712691, "learning_rate": 3.1875e-06, "loss": 0.009746464341878891, "num_tokens": 766544.0, "reward": 0.9625, "reward_std": 0.0816463440656662, "rewards/JointRewardFunction/mean": 0.9625, "rewards/JointRewardFunction/std": 0.0816463440656662, "step": 110, "step_time": 36.619024862399236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 807.3, "completions/max_terminated_length": 713.2, "completions/mean_length": 611.7875, "completions/mean_terminated_length": 595.7928649902344, "completions/min_length": 508.7, "completions/min_terminated_length": 508.7, "entropy": 0.40618473663926125, "epoch": 0.96, "frac_reward_zero_std": 0.925, "grad_norm": 1.03125, "kl": 0.08407443668693304, "learning_rate": 2.5625e-06, "loss": 0.01084473505616188, "num_tokens": 835859.0, "reward": 0.9625, "reward_std": 0.10606601536273956, "rewards/JointRewardFunction/mean": 0.9625, "rewards/JointRewardFunction/std": 0.10606601536273956, "step": 120, "step_time": 34.16594214020042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 751.9, "completions/max_terminated_length": 748.3, "completions/mean_length": 631.1, "completions/mean_terminated_length": 626.9517883300781, "completions/min_length": 544.1, "completions/min_terminated_length": 544.1, "entropy": 0.42196682561188936, "epoch": 1.04, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.08644672441296279, "learning_rate": 1.9375e-06, "loss": 0.00017178469570353628, "num_tokens": 906965.0, "reward": 1.0, "reward_std": 0.0, "rewards/JointRewardFunction/mean": 1.0, "rewards/JointRewardFunction/std": 0.0, "step": 130, "step_time": 31.96281487729957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 837.6, "completions/max_terminated_length": 826.9, "completions/mean_length": 642.5625, "completions/mean_terminated_length": 637.825, "completions/min_length": 522.1, "completions/min_terminated_length": 522.1, "entropy": 0.39782516546547414, "epoch": 1.12, "frac_reward_zero_std": 0.875, "grad_norm": 0.9375, "kl": 0.08095719190314413, "learning_rate": 1.3125000000000001e-06, "loss": 0.0059084448963403705, "num_tokens": 979018.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/JointRewardFunction/mean": 0.9375, "rewards/JointRewardFunction/std": 0.1767766922712326, "step": 140, "step_time": 35.47339765110009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 780.2, "completions/max_terminated_length": 709.4, "completions/mean_length": 608.125, "completions/mean_terminated_length": 597.3017883300781, "completions/min_length": 514.2, "completions/min_terminated_length": 514.2, "entropy": 0.3825716434046626, "epoch": 1.2, "frac_reward_zero_std": 0.9, "grad_norm": 0.0263671875, "kl": 0.0812916701193899, "learning_rate": 6.875000000000001e-07, "loss": 0.014202636480331422, "num_tokens": 1048244.0, "reward": 0.95, "reward_std": 0.11700168251991272, "rewards/JointRewardFunction/mean": 0.95, "rewards/JointRewardFunction/std": 0.11700168251991272, "step": 150, "step_time": 33.213279404800595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 785.1, "completions/max_terminated_length": 743.9, "completions/mean_length": 621.9, "completions/mean_terminated_length": 612.2803649902344, "completions/min_length": 508.9, "completions/min_terminated_length": 508.9, "entropy": 0.4208029452711344, "epoch": 1.28, "frac_reward_zero_std": 0.95, "grad_norm": 0.92578125, "kl": 0.08554110652767122, "learning_rate": 6.250000000000001e-08, "loss": 0.011326169967651368, "num_tokens": 1118898.0, "reward": 0.975, "reward_std": 0.07071067690849304, "rewards/JointRewardFunction/mean": 0.975, "rewards/JointRewardFunction/std": 0.07071067690849304, "step": 160, "step_time": 33.45168144740146 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 1118898, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }