483 lines
16 KiB
JSON
483 lines
16 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.28,
|
|
"eval_steps": 500,
|
|
"global_step": 160,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05,
|
|
"completions/max_length": 894.8,
|
|
"completions/max_terminated_length": 821.2,
|
|
"completions/mean_length": 560.2,
|
|
"completions/mean_terminated_length": 540.6345306396485,
|
|
"completions/min_length": 279.5,
|
|
"completions/min_terminated_length": 279.5,
|
|
"entropy": 0.38268125932663677,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 1.4375,
|
|
"kl": 0.07789193278222228,
|
|
"learning_rate": 9.4375e-06,
|
|
"loss": -0.007836591452360153,
|
|
"num_tokens": 65460.0,
|
|
"reward": 0.65,
|
|
"reward_std": 0.46797851026058196,
|
|
"rewards/JointRewardFunction/mean": 0.65,
|
|
"rewards/JointRewardFunction/std": 0.4679785281419754,
|
|
"step": 10,
|
|
"step_time": 36.23508502000004
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.075,
|
|
"completions/max_length": 900.1,
|
|
"completions/max_terminated_length": 823.7,
|
|
"completions/mean_length": 578.525,
|
|
"completions/mean_terminated_length": 543.8116729736328,
|
|
"completions/min_length": 254.6,
|
|
"completions/min_terminated_length": 254.6,
|
|
"entropy": 0.23411482032388448,
|
|
"epoch": 0.16,
|
|
"frac_reward_zero_std": 0.775,
|
|
"grad_norm": 1.1484375,
|
|
"kl": 0.1648747116792947,
|
|
"learning_rate": 8.8125e-06,
|
|
"loss": 0.0052785202860832214,
|
|
"num_tokens": 132386.0,
|
|
"reward": 0.6625,
|
|
"reward_std": 0.46628117859363555,
|
|
"rewards/JointRewardFunction/mean": 0.6625,
|
|
"rewards/JointRewardFunction/std": 0.4662812024354935,
|
|
"step": 20,
|
|
"step_time": 37.938748809599566
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 876.9,
|
|
"completions/max_terminated_length": 829.5,
|
|
"completions/mean_length": 622.2,
|
|
"completions/mean_terminated_length": 611.7642944335937,
|
|
"completions/min_length": 477.6,
|
|
"completions/min_terminated_length": 477.6,
|
|
"entropy": 0.22628286899998784,
|
|
"epoch": 0.24,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 0.0218505859375,
|
|
"kl": 0.12199527090415359,
|
|
"learning_rate": 8.1875e-06,
|
|
"loss": 0.007959160953760147,
|
|
"num_tokens": 202606.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.12246559262275696,
|
|
"rewards/JointRewardFunction/mean": 0.9375,
|
|
"rewards/JointRewardFunction/std": 0.12246559858322144,
|
|
"step": 30,
|
|
"step_time": 37.057444848399975
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 810.6,
|
|
"completions/max_terminated_length": 731.4,
|
|
"completions/mean_length": 596.8625,
|
|
"completions/mean_terminated_length": 584.707144165039,
|
|
"completions/min_length": 438.4,
|
|
"completions/min_terminated_length": 438.4,
|
|
"entropy": 0.42070485297590493,
|
|
"epoch": 0.32,
|
|
"frac_reward_zero_std": 0.85,
|
|
"grad_norm": 1.1015625,
|
|
"kl": 0.10627949037589132,
|
|
"learning_rate": 7.5625e-06,
|
|
"loss": -0.00913204848766327,
|
|
"num_tokens": 270929.0,
|
|
"reward": 0.9,
|
|
"reward_std": 0.20411193668842315,
|
|
"rewards/JointRewardFunction/mean": 0.9,
|
|
"rewards/JointRewardFunction/std": 0.20411194264888763,
|
|
"step": 40,
|
|
"step_time": 34.16536340820039
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05,
|
|
"completions/max_length": 916.6,
|
|
"completions/max_terminated_length": 800.7,
|
|
"completions/mean_length": 642.275,
|
|
"completions/mean_terminated_length": 620.6982238769531,
|
|
"completions/min_length": 483.4,
|
|
"completions/min_terminated_length": 483.4,
|
|
"entropy": 0.545534435659647,
|
|
"epoch": 0.4,
|
|
"frac_reward_zero_std": 0.825,
|
|
"grad_norm": 0.9921875,
|
|
"kl": 0.09743389897048474,
|
|
"learning_rate": 6.9375e-06,
|
|
"loss": 0.019783291220664977,
|
|
"num_tokens": 343141.0,
|
|
"reward": 0.9125,
|
|
"reward_std": 0.19864802658557892,
|
|
"rewards/JointRewardFunction/mean": 0.9125,
|
|
"rewards/JointRewardFunction/std": 0.19864802658557892,
|
|
"step": 50,
|
|
"step_time": 38.55176728389906
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 774.9,
|
|
"completions/max_terminated_length": 774.9,
|
|
"completions/mean_length": 599.525,
|
|
"completions/mean_terminated_length": 599.525,
|
|
"completions/min_length": 438.8,
|
|
"completions/min_terminated_length": 438.8,
|
|
"entropy": 0.5808290097862482,
|
|
"epoch": 0.48,
|
|
"frac_reward_zero_std": 0.925,
|
|
"grad_norm": 0.035400390625,
|
|
"kl": 0.10702053690329194,
|
|
"learning_rate": 6.3125e-06,
|
|
"loss": -0.009540864825248718,
|
|
"num_tokens": 411443.0,
|
|
"reward": 0.9625,
|
|
"reward_std": 0.10606601536273956,
|
|
"rewards/JointRewardFunction/mean": 0.9625,
|
|
"rewards/JointRewardFunction/std": 0.10606601536273956,
|
|
"step": 60,
|
|
"step_time": 33.111361246699744
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0375,
|
|
"completions/max_length": 848.1,
|
|
"completions/max_terminated_length": 757.4,
|
|
"completions/mean_length": 626.325,
|
|
"completions/mean_terminated_length": 609.8857238769531,
|
|
"completions/min_length": 511.5,
|
|
"completions/min_terminated_length": 511.5,
|
|
"entropy": 0.4436331996694207,
|
|
"epoch": 0.56,
|
|
"frac_reward_zero_std": 0.9,
|
|
"grad_norm": 0.060791015625,
|
|
"kl": 0.08952742610126734,
|
|
"learning_rate": 5.6875e-06,
|
|
"loss": 0.014259077608585358,
|
|
"num_tokens": 482005.0,
|
|
"reward": 0.95,
|
|
"reward_std": 0.11700168251991272,
|
|
"rewards/JointRewardFunction/mean": 0.95,
|
|
"rewards/JointRewardFunction/std": 0.11700168251991272,
|
|
"step": 70,
|
|
"step_time": 35.991650615099935
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 787.7,
|
|
"completions/max_terminated_length": 747.8,
|
|
"completions/mean_length": 593.25,
|
|
"completions/mean_terminated_length": 587.5642883300782,
|
|
"completions/min_length": 493.6,
|
|
"completions/min_terminated_length": 493.6,
|
|
"entropy": 0.328166064620018,
|
|
"epoch": 0.64,
|
|
"frac_reward_zero_std": 0.95,
|
|
"grad_norm": 0.017822265625,
|
|
"kl": 0.1307119549252093,
|
|
"learning_rate": 5.0625e-06,
|
|
"loss": 0.0031396135687828063,
|
|
"num_tokens": 550079.0,
|
|
"reward": 0.975,
|
|
"reward_std": 0.07071067690849304,
|
|
"rewards/JointRewardFunction/mean": 0.975,
|
|
"rewards/JointRewardFunction/std": 0.07071067690849304,
|
|
"step": 80,
|
|
"step_time": 33.55307105900029
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 788.4,
|
|
"completions/max_terminated_length": 788.4,
|
|
"completions/mean_length": 619.825,
|
|
"completions/mean_terminated_length": 619.825,
|
|
"completions/min_length": 510.4,
|
|
"completions/min_terminated_length": 510.4,
|
|
"entropy": 0.4017201948910952,
|
|
"epoch": 0.72,
|
|
"frac_reward_zero_std": 0.925,
|
|
"grad_norm": 0.0172119140625,
|
|
"kl": 0.08959094756282866,
|
|
"learning_rate": 4.4375e-06,
|
|
"loss": 0.001467562187463045,
|
|
"num_tokens": 620285.0,
|
|
"reward": 0.9625,
|
|
"reward_std": 0.10606601536273956,
|
|
"rewards/JointRewardFunction/mean": 0.9625,
|
|
"rewards/JointRewardFunction/std": 0.10606601536273956,
|
|
"step": 90,
|
|
"step_time": 33.41816141909967
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05,
|
|
"completions/max_length": 911.0,
|
|
"completions/max_terminated_length": 831.5,
|
|
"completions/mean_length": 671.025,
|
|
"completions/mean_terminated_length": 653.7214416503906,
|
|
"completions/min_length": 537.5,
|
|
"completions/min_terminated_length": 537.5,
|
|
"entropy": 0.4100338226184249,
|
|
"epoch": 0.8,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 0.021240234375,
|
|
"kl": 0.07856191159226,
|
|
"learning_rate": 3.8125e-06,
|
|
"loss": 0.017280958592891693,
|
|
"num_tokens": 694633.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1767766922712326,
|
|
"rewards/JointRewardFunction/mean": 0.9375,
|
|
"rewards/JointRewardFunction/std": 0.1767766922712326,
|
|
"step": 100,
|
|
"step_time": 38.310592102500415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 867.4,
|
|
"completions/max_terminated_length": 779.6,
|
|
"completions/mean_length": 642.0875,
|
|
"completions/mean_terminated_length": 616.8726257324219,
|
|
"completions/min_length": 501.5,
|
|
"completions/min_terminated_length": 501.5,
|
|
"entropy": 0.40164962466806176,
|
|
"epoch": 0.88,
|
|
"frac_reward_zero_std": 0.925,
|
|
"grad_norm": 1.1328125,
|
|
"kl": 0.0828359558712691,
|
|
"learning_rate": 3.1875e-06,
|
|
"loss": 0.009746464341878891,
|
|
"num_tokens": 766544.0,
|
|
"reward": 0.9625,
|
|
"reward_std": 0.0816463440656662,
|
|
"rewards/JointRewardFunction/mean": 0.9625,
|
|
"rewards/JointRewardFunction/std": 0.0816463440656662,
|
|
"step": 110,
|
|
"step_time": 36.619024862399236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0375,
|
|
"completions/max_length": 807.3,
|
|
"completions/max_terminated_length": 713.2,
|
|
"completions/mean_length": 611.7875,
|
|
"completions/mean_terminated_length": 595.7928649902344,
|
|
"completions/min_length": 508.7,
|
|
"completions/min_terminated_length": 508.7,
|
|
"entropy": 0.40618473663926125,
|
|
"epoch": 0.96,
|
|
"frac_reward_zero_std": 0.925,
|
|
"grad_norm": 1.03125,
|
|
"kl": 0.08407443668693304,
|
|
"learning_rate": 2.5625e-06,
|
|
"loss": 0.01084473505616188,
|
|
"num_tokens": 835859.0,
|
|
"reward": 0.9625,
|
|
"reward_std": 0.10606601536273956,
|
|
"rewards/JointRewardFunction/mean": 0.9625,
|
|
"rewards/JointRewardFunction/std": 0.10606601536273956,
|
|
"step": 120,
|
|
"step_time": 34.16594214020042
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 751.9,
|
|
"completions/max_terminated_length": 748.3,
|
|
"completions/mean_length": 631.1,
|
|
"completions/mean_terminated_length": 626.9517883300781,
|
|
"completions/min_length": 544.1,
|
|
"completions/min_terminated_length": 544.1,
|
|
"entropy": 0.42196682561188936,
|
|
"epoch": 1.04,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.041748046875,
|
|
"kl": 0.08644672441296279,
|
|
"learning_rate": 1.9375e-06,
|
|
"loss": 0.00017178469570353628,
|
|
"num_tokens": 906965.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/JointRewardFunction/mean": 1.0,
|
|
"rewards/JointRewardFunction/std": 0.0,
|
|
"step": 130,
|
|
"step_time": 31.96281487729957
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 837.6,
|
|
"completions/max_terminated_length": 826.9,
|
|
"completions/mean_length": 642.5625,
|
|
"completions/mean_terminated_length": 637.825,
|
|
"completions/min_length": 522.1,
|
|
"completions/min_terminated_length": 522.1,
|
|
"entropy": 0.39782516546547414,
|
|
"epoch": 1.12,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 0.9375,
|
|
"kl": 0.08095719190314413,
|
|
"learning_rate": 1.3125000000000001e-06,
|
|
"loss": 0.0059084448963403705,
|
|
"num_tokens": 979018.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1767766922712326,
|
|
"rewards/JointRewardFunction/mean": 0.9375,
|
|
"rewards/JointRewardFunction/std": 0.1767766922712326,
|
|
"step": 140,
|
|
"step_time": 35.47339765110009
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 780.2,
|
|
"completions/max_terminated_length": 709.4,
|
|
"completions/mean_length": 608.125,
|
|
"completions/mean_terminated_length": 597.3017883300781,
|
|
"completions/min_length": 514.2,
|
|
"completions/min_terminated_length": 514.2,
|
|
"entropy": 0.3825716434046626,
|
|
"epoch": 1.2,
|
|
"frac_reward_zero_std": 0.9,
|
|
"grad_norm": 0.0263671875,
|
|
"kl": 0.0812916701193899,
|
|
"learning_rate": 6.875000000000001e-07,
|
|
"loss": 0.014202636480331422,
|
|
"num_tokens": 1048244.0,
|
|
"reward": 0.95,
|
|
"reward_std": 0.11700168251991272,
|
|
"rewards/JointRewardFunction/mean": 0.95,
|
|
"rewards/JointRewardFunction/std": 0.11700168251991272,
|
|
"step": 150,
|
|
"step_time": 33.213279404800595
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 785.1,
|
|
"completions/max_terminated_length": 743.9,
|
|
"completions/mean_length": 621.9,
|
|
"completions/mean_terminated_length": 612.2803649902344,
|
|
"completions/min_length": 508.9,
|
|
"completions/min_terminated_length": 508.9,
|
|
"entropy": 0.4208029452711344,
|
|
"epoch": 1.28,
|
|
"frac_reward_zero_std": 0.95,
|
|
"grad_norm": 0.92578125,
|
|
"kl": 0.08554110652767122,
|
|
"learning_rate": 6.250000000000001e-08,
|
|
"loss": 0.011326169967651368,
|
|
"num_tokens": 1118898.0,
|
|
"reward": 0.975,
|
|
"reward_std": 0.07071067690849304,
|
|
"rewards/JointRewardFunction/mean": 0.975,
|
|
"rewards/JointRewardFunction/std": 0.07071067690849304,
|
|
"step": 160,
|
|
"step_time": 33.45168144740146
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 160,
|
|
"num_input_tokens_seen": 1118898,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 10,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|