1875 lines
69 KiB
JSON
1875 lines
69 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.9984,
|
||
|
|
"eval_steps": 50,
|
||
|
|
"global_step": 312,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0400390625,
|
||
|
|
"completions/max_length": 1536.0,
|
||
|
|
"completions/max_terminated_length": 1521.8,
|
||
|
|
"completions/mean_length": 276.6455078125,
|
||
|
|
"completions/mean_terminated_length": 224.1190185546875,
|
||
|
|
"completions/min_length": 2.0,
|
||
|
|
"completions/min_terminated_length": 2.0,
|
||
|
|
"epoch": 0.016,
|
||
|
|
"grad_norm": 0.05407445505261421,
|
||
|
|
"learning_rate": 3.1249999999999997e-07,
|
||
|
|
"loss": 0.0956,
|
||
|
|
"num_tokens": 17676882.0,
|
||
|
|
"reward": 0.6326022028923035,
|
||
|
|
"reward_std": 0.4947403073310852,
|
||
|
|
"rewards/accuracy_reward": 0.2208984375,
|
||
|
|
"rewards/brier_reward": 0.3710617899894714,
|
||
|
|
"rewards/confidence_one_or_zero": 0.27548828125,
|
||
|
|
"rewards/format_reward": 0.6732421875,
|
||
|
|
"rewards/mean_confidence_reward": 0.7399574875831604,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.034765625,
|
||
|
|
"completions/max_length": 1536.0,
|
||
|
|
"completions/max_terminated_length": 1498.2,
|
||
|
|
"completions/mean_length": 258.75498046875,
|
||
|
|
"completions/mean_terminated_length": 212.7857635498047,
|
||
|
|
"completions/min_length": 2.0,
|
||
|
|
"completions/min_terminated_length": 2.0,
|
||
|
|
"epoch": 0.032,
|
||
|
|
"grad_norm": 0.0354408323764801,
|
||
|
|
"learning_rate": 6.249999999999999e-07,
|
||
|
|
"loss": 0.0885,
|
||
|
|
"num_tokens": 35426885.0,
|
||
|
|
"reward": 0.6595678806304932,
|
||
|
|
"reward_std": 0.46407333612442014,
|
||
|
|
"rewards/accuracy_reward": 0.21484375,
|
||
|
|
"rewards/brier_reward": 0.38378217816352844,
|
||
|
|
"rewards/confidence_one_or_zero": 0.26513671875,
|
||
|
|
"rewards/format_reward": 0.7205078125,
|
||
|
|
"rewards/mean_confidence_reward": 0.7485471248626709,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0203125,
|
||
|
|
"completions/max_length": 1536.0,
|
||
|
|
"completions/max_terminated_length": 1443.6,
|
||
|
|
"completions/mean_length": 203.36826171875,
|
||
|
|
"completions/mean_terminated_length": 175.87453918457032,
|
||
|
|
"completions/min_length": 1.8,
|
||
|
|
"completions/min_terminated_length": 1.8,
|
||
|
|
"epoch": 0.048,
|
||
|
|
"grad_norm": 0.030709726735949516,
|
||
|
|
"learning_rate": 9.374999999999999e-07,
|
||
|
|
"loss": 0.0683,
|
||
|
|
"num_tokens": 52558112.0,
|
||
|
|
"reward": 0.8185818791389465,
|
||
|
|
"reward_std": 0.37540732622146605,
|
||
|
|
"rewards/accuracy_reward": 0.2767578125,
|
||
|
|
"rewards/brier_reward": 0.4851109802722931,
|
||
|
|
"rewards/confidence_one_or_zero": 0.26318359375,
|
||
|
|
"rewards/format_reward": 0.87529296875,
|
||
|
|
"rewards/mean_confidence_reward": 0.763912582397461,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0048828125,
|
||
|
|
"completions/max_length": 1536.0,
|
||
|
|
"completions/max_terminated_length": 1206.6,
|
||
|
|
"completions/mean_length": 136.7234375,
|
||
|
|
"completions/mean_terminated_length": 129.86282653808593,
|
||
|
|
"completions/min_length": 7.4,
|
||
|
|
"completions/min_terminated_length": 7.4,
|
||
|
|
"epoch": 0.064,
|
||
|
|
"grad_norm": 0.030825674533843994,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0144,
|
||
|
|
"num_tokens": 68876560.0,
|
||
|
|
"reward": 0.944485855102539,
|
||
|
|
"reward_std": 0.2918001413345337,
|
||
|
|
"rewards/accuracy_reward": 0.336328125,
|
||
|
|
"rewards/brier_reward": 0.5843800187110901,
|
||
|
|
"rewards/confidence_one_or_zero": 0.20009765625,
|
||
|
|
"rewards/format_reward": 0.96826171875,
|
||
|
|
"rewards/mean_confidence_reward": 0.7416761994361878,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.00166015625,
|
||
|
|
"completions/max_length": 1536.0,
|
||
|
|
"completions/max_terminated_length": 858.0,
|
||
|
|
"completions/mean_length": 121.93955078125,
|
||
|
|
"completions/mean_terminated_length": 119.58839721679688,
|
||
|
|
"completions/min_length": 16.0,
|
||
|
|
"completions/min_terminated_length": 16.0,
|
||
|
|
"epoch": 0.08,
|
||
|
|
"grad_norm": 0.07219453901052475,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"num_tokens": 85058373.0,
|
||
|
|
"reward": 1.0135140299797059,
|
||
|
|
"reward_std": 0.22123381197452546,
|
||
|
|
"rewards/accuracy_reward": 0.3626953125,
|
||
|
|
"rewards/brier_reward": 0.6754642128944397,
|
||
|
|
"rewards/confidence_one_or_zero": 0.083203125,
|
||
|
|
"rewards/format_reward": 0.9888671875,
|
||
|
|
"rewards/mean_confidence_reward": 0.6404195070266724,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.000390625,
|
||
|
|
"completions/max_length": 1187.0,
|
||
|
|
"completions/max_terminated_length": 636.0,
|
||
|
|
"completions/mean_length": 122.44267578125,
|
||
|
|
"completions/mean_terminated_length": 121.89026336669922,
|
||
|
|
"completions/min_length": 40.0,
|
||
|
|
"completions/min_terminated_length": 40.0,
|
||
|
|
"epoch": 0.096,
|
||
|
|
"grad_norm": 0.0071799191646277905,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0011,
|
||
|
|
"num_tokens": 101356794.0,
|
||
|
|
"reward": 1.050560426712036,
|
||
|
|
"reward_std": 0.16749218702316285,
|
||
|
|
"rewards/accuracy_reward": 0.36787109375,
|
||
|
|
"rewards/brier_reward": 0.7357877850532532,
|
||
|
|
"rewards/confidence_one_or_zero": 0.04443359375,
|
||
|
|
"rewards/format_reward": 0.9974609375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5114866554737091,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0005859375,
|
||
|
|
"completions/max_length": 822.8,
|
||
|
|
"completions/max_terminated_length": 457.6,
|
||
|
|
"completions/mean_length": 125.12060546875,
|
||
|
|
"completions/mean_terminated_length": 124.29327087402343,
|
||
|
|
"completions/min_length": 40.8,
|
||
|
|
"completions/min_terminated_length": 40.8,
|
||
|
|
"epoch": 0.112,
|
||
|
|
"grad_norm": 0.012783159501850605,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 117747501.0,
|
||
|
|
"reward": 1.0780974626541138,
|
||
|
|
"reward_std": 0.12303584218025207,
|
||
|
|
"rewards/accuracy_reward": 0.40283203125,
|
||
|
|
"rewards/brier_reward": 0.7566823959350586,
|
||
|
|
"rewards/confidence_one_or_zero": 0.05263671875,
|
||
|
|
"rewards/format_reward": 0.9966796875,
|
||
|
|
"rewards/mean_confidence_reward": 0.3563989281654358,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.000390625,
|
||
|
|
"completions/max_length": 1302.2,
|
||
|
|
"completions/max_terminated_length": 380.2,
|
||
|
|
"completions/mean_length": 131.248828125,
|
||
|
|
"completions/mean_terminated_length": 130.69945831298827,
|
||
|
|
"completions/min_length": 44.2,
|
||
|
|
"completions/min_terminated_length": 44.2,
|
||
|
|
"epoch": 0.128,
|
||
|
|
"grad_norm": 0.03782346472144127,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_tokens": 134008161.0,
|
||
|
|
"reward": 1.0803744792938232,
|
||
|
|
"reward_std": 0.09947807043790817,
|
||
|
|
"rewards/accuracy_reward": 0.40439453125,
|
||
|
|
"rewards/brier_reward": 0.7587952256202698,
|
||
|
|
"rewards/confidence_one_or_zero": 0.04990234375,
|
||
|
|
"rewards/format_reward": 0.99755859375,
|
||
|
|
"rewards/mean_confidence_reward": 0.30792068839073183,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 767.4,
|
||
|
|
"completions/max_terminated_length": 526.6,
|
||
|
|
"completions/mean_length": 134.51650390625,
|
||
|
|
"completions/mean_terminated_length": 134.37986755371094,
|
||
|
|
"completions/min_length": 29.0,
|
||
|
|
"completions/min_terminated_length": 29.0,
|
||
|
|
"epoch": 0.144,
|
||
|
|
"grad_norm": 0.0044364649802446365,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0005,
|
||
|
|
"num_tokens": 150336042.0,
|
||
|
|
"reward": 1.1184379577636718,
|
||
|
|
"reward_std": 0.10152655839920044,
|
||
|
|
"rewards/accuracy_reward": 0.5048828125,
|
||
|
|
"rewards/brier_reward": 0.7332618951797485,
|
||
|
|
"rewards/confidence_one_or_zero": 0.03076171875,
|
||
|
|
"rewards/format_reward": 0.99873046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.33715721368789675,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 596.4,
|
||
|
|
"completions/max_terminated_length": 347.8,
|
||
|
|
"completions/mean_length": 141.1640625,
|
||
|
|
"completions/mean_terminated_length": 141.02822570800782,
|
||
|
|
"completions/min_length": 53.6,
|
||
|
|
"completions/min_terminated_length": 53.6,
|
||
|
|
"epoch": 0.16,
|
||
|
|
"grad_norm": 0.0031058751046657562,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0009,
|
||
|
|
"num_tokens": 166802490.0,
|
||
|
|
"reward": 1.1045508146286012,
|
||
|
|
"reward_std": 0.10970858335494996,
|
||
|
|
"rewards/accuracy_reward": 0.4513671875,
|
||
|
|
"rewards/brier_reward": 0.7593937039375305,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01396484375,
|
||
|
|
"rewards/format_reward": 0.99833984375,
|
||
|
|
"rewards/mean_confidence_reward": 0.4095295906066895,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 249.75,
|
||
|
|
"eval_completions/max_terminated_length": 249.75,
|
||
|
|
"eval_completions/mean_length": 143.74979782104492,
|
||
|
|
"eval_completions/mean_terminated_length": 143.74979782104492,
|
||
|
|
"eval_completions/min_length": 72.75,
|
||
|
|
"eval_completions/min_terminated_length": 72.75,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 166802490.0,
|
||
|
|
"eval_reward": 1.0683082938194275,
|
||
|
|
"eval_reward_std": 0.22595830261707306,
|
||
|
|
"eval_rewards/accuracy_reward": 0.3671875,
|
||
|
|
"eval_rewards/brier_reward": 0.7694281339645386,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.0078125,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.4460156336426735,
|
||
|
|
"eval_runtime": 17.2028,
|
||
|
|
"eval_samples_per_second": 29.065,
|
||
|
|
"eval_steps_per_second": 0.233,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 356.0,
|
||
|
|
"completions/max_terminated_length": 356.0,
|
||
|
|
"completions/mean_length": 147.52265625,
|
||
|
|
"completions/mean_terminated_length": 147.52265625,
|
||
|
|
"completions/min_length": 56.2,
|
||
|
|
"completions/min_terminated_length": 56.2,
|
||
|
|
"epoch": 0.176,
|
||
|
|
"grad_norm": 0.008834286592900753,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0005,
|
||
|
|
"num_tokens": 183550242.0,
|
||
|
|
"reward": 1.105972409248352,
|
||
|
|
"reward_std": 0.10724246203899383,
|
||
|
|
"rewards/accuracy_reward": 0.44873046875,
|
||
|
|
"rewards/brier_reward": 0.7641900300979614,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01337890625,
|
||
|
|
"rewards/format_reward": 0.9990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.4539414048194885,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 342.0,
|
||
|
|
"completions/max_terminated_length": 342.0,
|
||
|
|
"completions/mean_length": 155.03759765625,
|
||
|
|
"completions/mean_terminated_length": 155.03759765625,
|
||
|
|
"completions/min_length": 68.2,
|
||
|
|
"completions/min_terminated_length": 68.2,
|
||
|
|
"epoch": 0.192,
|
||
|
|
"grad_norm": 0.0015183566138148308,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0001,
|
||
|
|
"num_tokens": 199952643.0,
|
||
|
|
"reward": 1.1174006700515746,
|
||
|
|
"reward_std": 0.10823124945163727,
|
||
|
|
"rewards/accuracy_reward": 0.4720703125,
|
||
|
|
"rewards/brier_reward": 0.7630230784416199,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00830078125,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.4763046860694885,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 429.8,
|
||
|
|
"completions/max_terminated_length": 429.8,
|
||
|
|
"completions/mean_length": 164.254296875,
|
||
|
|
"completions/mean_terminated_length": 164.254296875,
|
||
|
|
"completions/min_length": 82.2,
|
||
|
|
"completions/min_terminated_length": 82.2,
|
||
|
|
"epoch": 0.208,
|
||
|
|
"grad_norm": 0.0033636174630373716,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 216666831.0,
|
||
|
|
"reward": 1.144743847846985,
|
||
|
|
"reward_std": 0.11080079525709152,
|
||
|
|
"rewards/accuracy_reward": 0.52763671875,
|
||
|
|
"rewards/brier_reward": 0.7621429681777954,
|
||
|
|
"rewards/confidence_one_or_zero": 0.008203125,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.48511279225349424,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 674.8,
|
||
|
|
"completions/max_terminated_length": 462.6,
|
||
|
|
"completions/mean_length": 168.61201171875,
|
||
|
|
"completions/mean_terminated_length": 168.47879638671876,
|
||
|
|
"completions/min_length": 77.4,
|
||
|
|
"completions/min_terminated_length": 77.4,
|
||
|
|
"epoch": 0.224,
|
||
|
|
"grad_norm": 0.0016244107391685247,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_tokens": 233546602.0,
|
||
|
|
"reward": 1.1177247285842895,
|
||
|
|
"reward_std": 0.10474657416343688,
|
||
|
|
"rewards/accuracy_reward": 0.46884765625,
|
||
|
|
"rewards/brier_reward": 0.7668938279151917,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00947265625,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.4841289222240448,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 651.0,
|
||
|
|
"completions/max_terminated_length": 417.8,
|
||
|
|
"completions/mean_length": 174.24892578125,
|
||
|
|
"completions/mean_terminated_length": 174.11595458984374,
|
||
|
|
"completions/min_length": 57.0,
|
||
|
|
"completions/min_terminated_length": 57.0,
|
||
|
|
"epoch": 0.24,
|
||
|
|
"grad_norm": 0.002639307640492916,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 250582591.0,
|
||
|
|
"reward": 1.1502854824066162,
|
||
|
|
"reward_std": 0.12000200897455215,
|
||
|
|
"rewards/accuracy_reward": 0.53818359375,
|
||
|
|
"rewards/brier_reward": 0.7634606242179871,
|
||
|
|
"rewards/confidence_one_or_zero": 0.005078125,
|
||
|
|
"rewards/format_reward": 0.99892578125,
|
||
|
|
"rewards/mean_confidence_reward": 0.489949232339859,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.000390625,
|
||
|
|
"completions/max_length": 843.8,
|
||
|
|
"completions/max_terminated_length": 765.6,
|
||
|
|
"completions/mean_length": 175.550390625,
|
||
|
|
"completions/mean_terminated_length": 175.02010498046874,
|
||
|
|
"completions/min_length": 81.6,
|
||
|
|
"completions/min_terminated_length": 81.6,
|
||
|
|
"epoch": 0.256,
|
||
|
|
"grad_norm": 0.02150336280465126,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"num_tokens": 267435043.0,
|
||
|
|
"reward": 1.1393208265304566,
|
||
|
|
"reward_std": 0.11671655029058456,
|
||
|
|
"rewards/accuracy_reward": 0.5083984375,
|
||
|
|
"rewards/brier_reward": 0.7709258198738098,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01162109375,
|
||
|
|
"rewards/format_reward": 0.99931640625,
|
||
|
|
"rewards/mean_confidence_reward": 0.48744922280311587,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 492.0,
|
||
|
|
"completions/max_terminated_length": 492.0,
|
||
|
|
"completions/mean_length": 183.68974609375,
|
||
|
|
"completions/mean_terminated_length": 183.68974609375,
|
||
|
|
"completions/min_length": 79.4,
|
||
|
|
"completions/min_terminated_length": 79.4,
|
||
|
|
"epoch": 0.272,
|
||
|
|
"grad_norm": 0.0036507430486381054,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_tokens": 284281722.0,
|
||
|
|
"reward": 1.1324650287628173,
|
||
|
|
"reward_std": 0.11320338100194931,
|
||
|
|
"rewards/accuracy_reward": 0.4955078125,
|
||
|
|
"rewards/brier_reward": 0.7696167230606079,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01044921875,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.48797558546066283,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 675.0,
|
||
|
|
"completions/max_terminated_length": 453.2,
|
||
|
|
"completions/mean_length": 183.3435546875,
|
||
|
|
"completions/mean_terminated_length": 183.211669921875,
|
||
|
|
"completions/min_length": 71.6,
|
||
|
|
"completions/min_terminated_length": 71.6,
|
||
|
|
"epoch": 0.288,
|
||
|
|
"grad_norm": 0.00159507489297539,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_tokens": 301117336.0,
|
||
|
|
"reward": 1.1380449295043946,
|
||
|
|
"reward_std": 0.11923972368240357,
|
||
|
|
"rewards/accuracy_reward": 0.50419921875,
|
||
|
|
"rewards/brier_reward": 0.7722803950309753,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00966796875,
|
||
|
|
"rewards/format_reward": 0.999609375,
|
||
|
|
"rewards/mean_confidence_reward": 0.4882441520690918,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 729.4,
|
||
|
|
"completions/max_terminated_length": 545.0,
|
||
|
|
"completions/mean_length": 188.354296875,
|
||
|
|
"completions/mean_terminated_length": 188.22288818359374,
|
||
|
|
"completions/min_length": 74.2,
|
||
|
|
"completions/min_terminated_length": 74.2,
|
||
|
|
"epoch": 0.304,
|
||
|
|
"grad_norm": 0.0020240871235728264,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 317976036.0,
|
||
|
|
"reward": 1.1388062238693237,
|
||
|
|
"reward_std": 0.11345189213752746,
|
||
|
|
"rewards/accuracy_reward": 0.50478515625,
|
||
|
|
"rewards/brier_reward": 0.7739005923271179,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01728515625,
|
||
|
|
"rewards/format_reward": 0.99892578125,
|
||
|
|
"rewards/mean_confidence_reward": 0.48295703530311584,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 662.4,
|
||
|
|
"completions/max_terminated_length": 447.6,
|
||
|
|
"completions/mean_length": 191.061328125,
|
||
|
|
"completions/mean_terminated_length": 190.9298309326172,
|
||
|
|
"completions/min_length": 86.8,
|
||
|
|
"completions/min_terminated_length": 86.8,
|
||
|
|
"epoch": 0.32,
|
||
|
|
"grad_norm": 0.0013722889125347137,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_tokens": 335021208.0,
|
||
|
|
"reward": 1.1509589195251464,
|
||
|
|
"reward_std": 0.1035462662577629,
|
||
|
|
"rewards/accuracy_reward": 0.52197265625,
|
||
|
|
"rewards/brier_reward": 0.7800418734550476,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01318359375,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.4988081157207489,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 411.75,
|
||
|
|
"eval_completions/max_terminated_length": 411.75,
|
||
|
|
"eval_completions/mean_length": 195.53663635253906,
|
||
|
|
"eval_completions/mean_terminated_length": 195.53663635253906,
|
||
|
|
"eval_completions/min_length": 107.25,
|
||
|
|
"eval_completions/min_terminated_length": 107.25,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 335021208.0,
|
||
|
|
"eval_reward": 1.0887417793273926,
|
||
|
|
"eval_reward_std": 0.25174758210778236,
|
||
|
|
"eval_rewards/accuracy_reward": 0.400390625,
|
||
|
|
"eval_rewards/brier_reward": 0.7770919799804688,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.013671875,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.46802735328674316,
|
||
|
|
"eval_runtime": 22.2738,
|
||
|
|
"eval_samples_per_second": 22.448,
|
||
|
|
"eval_steps_per_second": 0.18,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.000390625,
|
||
|
|
"completions/max_length": 1125.0,
|
||
|
|
"completions/max_terminated_length": 479.8,
|
||
|
|
"completions/mean_length": 196.13671875,
|
||
|
|
"completions/mean_terminated_length": 195.61272583007812,
|
||
|
|
"completions/min_length": 92.0,
|
||
|
|
"completions/min_terminated_length": 92.0,
|
||
|
|
"epoch": 0.336,
|
||
|
|
"grad_norm": 0.0013452547136694193,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"num_tokens": 351752080.0,
|
||
|
|
"reward": 1.1555601119995118,
|
||
|
|
"reward_std": 0.11393154710531235,
|
||
|
|
"rewards/accuracy_reward": 0.52939453125,
|
||
|
|
"rewards/brier_reward": 0.7823106408119201,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01396484375,
|
||
|
|
"rewards/format_reward": 0.9994140625,
|
||
|
|
"rewards/mean_confidence_reward": 0.49622313380241395,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 908.0,
|
||
|
|
"completions/max_terminated_length": 493.8,
|
||
|
|
"completions/mean_length": 199.18369140625,
|
||
|
|
"completions/mean_terminated_length": 198.923095703125,
|
||
|
|
"completions/min_length": 97.6,
|
||
|
|
"completions/min_terminated_length": 97.6,
|
||
|
|
"epoch": 0.352,
|
||
|
|
"grad_norm": 0.002751865889877081,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_tokens": 369052137.0,
|
||
|
|
"reward": 1.1255475521087646,
|
||
|
|
"reward_std": 0.11049925088882447,
|
||
|
|
"rewards/accuracy_reward": 0.4666015625,
|
||
|
|
"rewards/brier_reward": 0.7846879482269287,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01533203125,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.4882112622261047,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 627.8,
|
||
|
|
"completions/max_terminated_length": 627.8,
|
||
|
|
"completions/mean_length": 203.8998046875,
|
||
|
|
"completions/mean_terminated_length": 203.8998046875,
|
||
|
|
"completions/min_length": 95.6,
|
||
|
|
"completions/min_terminated_length": 95.6,
|
||
|
|
"epoch": 0.368,
|
||
|
|
"grad_norm": 0.002145805163308978,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_tokens": 386205543.0,
|
||
|
|
"reward": 1.1361007690429688,
|
||
|
|
"reward_std": 0.1053838849067688,
|
||
|
|
"rewards/accuracy_reward": 0.48994140625,
|
||
|
|
"rewards/brier_reward": 0.7822591662406921,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01376953125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5061289191246032,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 659.6,
|
||
|
|
"completions/max_terminated_length": 659.6,
|
||
|
|
"completions/mean_length": 201.56748046875,
|
||
|
|
"completions/mean_terminated_length": 201.56748046875,
|
||
|
|
"completions/min_length": 97.0,
|
||
|
|
"completions/min_terminated_length": 97.0,
|
||
|
|
"epoch": 0.384,
|
||
|
|
"grad_norm": 0.002008226700127125,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 403126106.0,
|
||
|
|
"reward": 1.1584414958953857,
|
||
|
|
"reward_std": 0.10435761213302612,
|
||
|
|
"rewards/accuracy_reward": 0.521484375,
|
||
|
|
"rewards/brier_reward": 0.7953975558280945,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01845703125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5107519447803497,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 486.2,
|
||
|
|
"completions/max_terminated_length": 486.2,
|
||
|
|
"completions/mean_length": 201.58603515625,
|
||
|
|
"completions/mean_terminated_length": 201.58603515625,
|
||
|
|
"completions/min_length": 96.8,
|
||
|
|
"completions/min_terminated_length": 96.8,
|
||
|
|
"epoch": 0.4,
|
||
|
|
"grad_norm": 0.004599791020154953,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 420226795.0,
|
||
|
|
"reward": 1.1391048192977906,
|
||
|
|
"reward_std": 0.11113806515932083,
|
||
|
|
"rewards/accuracy_reward": 0.4962890625,
|
||
|
|
"rewards/brier_reward": 0.781919538974762,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0142578125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5150127053260803,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 573.2,
|
||
|
|
"completions/max_terminated_length": 573.2,
|
||
|
|
"completions/mean_length": 201.14990234375,
|
||
|
|
"completions/mean_terminated_length": 201.14990234375,
|
||
|
|
"completions/min_length": 98.6,
|
||
|
|
"completions/min_terminated_length": 98.6,
|
||
|
|
"epoch": 0.416,
|
||
|
|
"grad_norm": 0.0012156119337305427,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 437167754.0,
|
||
|
|
"reward": 1.150643491744995,
|
||
|
|
"reward_std": 0.1118047833442688,
|
||
|
|
"rewards/accuracy_reward": 0.50810546875,
|
||
|
|
"rewards/brier_reward": 0.7934735059738159,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0142578125,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.4986997008323669,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 674.8,
|
||
|
|
"completions/max_terminated_length": 482.4,
|
||
|
|
"completions/mean_length": 200.10625,
|
||
|
|
"completions/mean_terminated_length": 199.97588195800782,
|
||
|
|
"completions/min_length": 95.4,
|
||
|
|
"completions/min_terminated_length": 95.4,
|
||
|
|
"epoch": 0.432,
|
||
|
|
"grad_norm": 0.006859796121716499,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 454231178.0,
|
||
|
|
"reward": 1.1733263969421386,
|
||
|
|
"reward_std": 0.10346025228500366,
|
||
|
|
"rewards/accuracy_reward": 0.54521484375,
|
||
|
|
"rewards/brier_reward": 0.8016322970390319,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01435546875,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5186787366867065,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 719.8,
|
||
|
|
"completions/max_terminated_length": 518.8,
|
||
|
|
"completions/mean_length": 208.76640625,
|
||
|
|
"completions/mean_terminated_length": 208.63664855957032,
|
||
|
|
"completions/min_length": 96.8,
|
||
|
|
"completions/min_terminated_length": 96.8,
|
||
|
|
"epoch": 0.448,
|
||
|
|
"grad_norm": 0.0016618920490145683,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_tokens": 471321746.0,
|
||
|
|
"reward": 1.1534050226211547,
|
||
|
|
"reward_std": 0.11015735268592834,
|
||
|
|
"rewards/accuracy_reward": 0.51103515625,
|
||
|
|
"rewards/brier_reward": 0.7959691882133484,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0083984375,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5418408274650574,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 566.8,
|
||
|
|
"completions/max_terminated_length": 566.8,
|
||
|
|
"completions/mean_length": 215.3693359375,
|
||
|
|
"completions/mean_terminated_length": 215.3693359375,
|
||
|
|
"completions/min_length": 99.2,
|
||
|
|
"completions/min_terminated_length": 99.2,
|
||
|
|
"epoch": 0.464,
|
||
|
|
"grad_norm": 0.0014355273451656103,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 488697944.0,
|
||
|
|
"reward": 1.1193523406982422,
|
||
|
|
"reward_std": 0.10776209384202957,
|
||
|
|
"rewards/accuracy_reward": 0.458984375,
|
||
|
|
"rewards/brier_reward": 0.7799146294593811,
|
||
|
|
"rewards/confidence_one_or_zero": 0.011328125,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.520960932970047,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 752.6,
|
||
|
|
"completions/max_terminated_length": 530.4,
|
||
|
|
"completions/mean_length": 216.18486328125,
|
||
|
|
"completions/mean_terminated_length": 215.92719421386718,
|
||
|
|
"completions/min_length": 108.8,
|
||
|
|
"completions/min_terminated_length": 108.8,
|
||
|
|
"epoch": 0.48,
|
||
|
|
"grad_norm": 0.0018099879380315542,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_tokens": 505959709.0,
|
||
|
|
"reward": 1.1509707689285278,
|
||
|
|
"reward_std": 0.10748694986104965,
|
||
|
|
"rewards/accuracy_reward": 0.512109375,
|
||
|
|
"rewards/brier_reward": 0.7900263905525208,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01904296875,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.4951982319355011,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 374.5,
|
||
|
|
"eval_completions/max_terminated_length": 374.5,
|
||
|
|
"eval_completions/mean_length": 216.64392471313477,
|
||
|
|
"eval_completions/mean_terminated_length": 216.64392471313477,
|
||
|
|
"eval_completions/min_length": 125.0,
|
||
|
|
"eval_completions/min_terminated_length": 125.0,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 505959709.0,
|
||
|
|
"eval_reward": 1.1084575355052948,
|
||
|
|
"eval_reward_std": 0.2621918395161629,
|
||
|
|
"eval_rewards/accuracy_reward": 0.41015625,
|
||
|
|
"eval_rewards/brier_reward": 0.8067578077316284,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.0234375,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.4730468988418579,
|
||
|
|
"eval_runtime": 21.7939,
|
||
|
|
"eval_samples_per_second": 22.942,
|
||
|
|
"eval_steps_per_second": 0.184,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 588.8,
|
||
|
|
"completions/max_terminated_length": 588.8,
|
||
|
|
"completions/mean_length": 218.93642578125,
|
||
|
|
"completions/mean_terminated_length": 218.93642578125,
|
||
|
|
"completions/min_length": 106.4,
|
||
|
|
"completions/min_terminated_length": 106.4,
|
||
|
|
"epoch": 0.496,
|
||
|
|
"grad_norm": 0.0012288556899875402,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0001,
|
||
|
|
"num_tokens": 523509458.0,
|
||
|
|
"reward": 1.1674549341201783,
|
||
|
|
"reward_std": 0.0982852265238762,
|
||
|
|
"rewards/accuracy_reward": 0.5412109375,
|
||
|
|
"rewards/brier_reward": 0.7936979055404663,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01201171875,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5015585958957672,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 729.2,
|
||
|
|
"completions/max_terminated_length": 520.6,
|
||
|
|
"completions/mean_length": 218.68642578125,
|
||
|
|
"completions/mean_terminated_length": 218.5576599121094,
|
||
|
|
"completions/min_length": 110.6,
|
||
|
|
"completions/min_terminated_length": 110.6,
|
||
|
|
"epoch": 0.512,
|
||
|
|
"grad_norm": 0.0026432271115481853,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_tokens": 540894471.0,
|
||
|
|
"reward": 1.1772954702377318,
|
||
|
|
"reward_std": 0.10269609093666077,
|
||
|
|
"rewards/accuracy_reward": 0.54892578125,
|
||
|
|
"rewards/brier_reward": 0.8057618141174316,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00927734375,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5223603427410126,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.00029296875,
|
||
|
|
"completions/max_length": 1145.4,
|
||
|
|
"completions/max_terminated_length": 564.4,
|
||
|
|
"completions/mean_length": 220.1521484375,
|
||
|
|
"completions/mean_terminated_length": 219.7665252685547,
|
||
|
|
"completions/min_length": 113.2,
|
||
|
|
"completions/min_terminated_length": 113.2,
|
||
|
|
"epoch": 0.528,
|
||
|
|
"grad_norm": 0.0011872347677126527,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_tokens": 558178365.0,
|
||
|
|
"reward": 1.1743324041366576,
|
||
|
|
"reward_std": 0.09865072965621949,
|
||
|
|
"rewards/accuracy_reward": 0.53583984375,
|
||
|
|
"rewards/brier_reward": 0.8131169199943542,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0087890625,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.5171630859375,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 908.0,
|
||
|
|
"completions/max_terminated_length": 513.6,
|
||
|
|
"completions/mean_length": 224.51904296875,
|
||
|
|
"completions/mean_terminated_length": 224.26331481933593,
|
||
|
|
"completions/min_length": 103.4,
|
||
|
|
"completions/min_terminated_length": 103.4,
|
||
|
|
"epoch": 0.544,
|
||
|
|
"grad_norm": 0.0013052740832790732,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_tokens": 575641024.0,
|
||
|
|
"reward": 1.180383038520813,
|
||
|
|
"reward_std": 0.10921536087989807,
|
||
|
|
"rewards/accuracy_reward": 0.56044921875,
|
||
|
|
"rewards/brier_reward": 0.80060875415802,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00556640625,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.5579150438308715,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 497.4,
|
||
|
|
"completions/max_terminated_length": 497.4,
|
||
|
|
"completions/mean_length": 223.90068359375,
|
||
|
|
"completions/mean_terminated_length": 223.90068359375,
|
||
|
|
"completions/min_length": 108.6,
|
||
|
|
"completions/min_terminated_length": 108.6,
|
||
|
|
"epoch": 0.56,
|
||
|
|
"grad_norm": 0.0013972694287076592,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 592755175.0,
|
||
|
|
"reward": 1.1710703134536744,
|
||
|
|
"reward_std": 0.10018587708473206,
|
||
|
|
"rewards/accuracy_reward": 0.53154296875,
|
||
|
|
"rewards/brier_reward": 0.8105966329574585,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00439453125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5557578206062317,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 913.2,
|
||
|
|
"completions/max_terminated_length": 470.2,
|
||
|
|
"completions/mean_length": 222.59306640625,
|
||
|
|
"completions/mean_terminated_length": 222.33611450195312,
|
||
|
|
"completions/min_length": 112.2,
|
||
|
|
"completions/min_terminated_length": 112.2,
|
||
|
|
"epoch": 0.576,
|
||
|
|
"grad_norm": 0.0014351216377690434,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 610221152.0,
|
||
|
|
"reward": 1.1574892282485962,
|
||
|
|
"reward_std": 0.09260296672582627,
|
||
|
|
"rewards/accuracy_reward": 0.516796875,
|
||
|
|
"rewards/brier_reward": 0.7984735131263733,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00673828125,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.5536435604095459,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 476.6,
|
||
|
|
"completions/max_terminated_length": 476.6,
|
||
|
|
"completions/mean_length": 223.04169921875,
|
||
|
|
"completions/mean_terminated_length": 223.04169921875,
|
||
|
|
"completions/min_length": 112.4,
|
||
|
|
"completions/min_terminated_length": 112.4,
|
||
|
|
"epoch": 0.592,
|
||
|
|
"grad_norm": 0.002771401545032859,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0001,
|
||
|
|
"num_tokens": 627672811.0,
|
||
|
|
"reward": 1.1642754554748536,
|
||
|
|
"reward_std": 0.09222442507743836,
|
||
|
|
"rewards/accuracy_reward": 0.52421875,
|
||
|
|
"rewards/brier_reward": 0.80452641248703,
|
||
|
|
"rewards/confidence_one_or_zero": 0.008984375,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5350224733352661,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 928.8,
|
||
|
|
"completions/max_terminated_length": 539.6,
|
||
|
|
"completions/mean_length": 225.85869140625,
|
||
|
|
"completions/mean_terminated_length": 225.60252990722657,
|
||
|
|
"completions/min_length": 113.2,
|
||
|
|
"completions/min_terminated_length": 113.2,
|
||
|
|
"epoch": 0.608,
|
||
|
|
"grad_norm": 0.0009160145418718457,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_tokens": 644985092.0,
|
||
|
|
"reward": 1.1762210130691528,
|
||
|
|
"reward_std": 0.0809452623128891,
|
||
|
|
"rewards/accuracy_reward": 0.52841796875,
|
||
|
|
"rewards/brier_reward": 0.8242184281349182,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01025390625,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5095595717430115,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 490.2,
|
||
|
|
"completions/max_terminated_length": 490.2,
|
||
|
|
"completions/mean_length": 225.5041015625,
|
||
|
|
"completions/mean_terminated_length": 225.5041015625,
|
||
|
|
"completions/min_length": 107.4,
|
||
|
|
"completions/min_terminated_length": 107.4,
|
||
|
|
"epoch": 0.624,
|
||
|
|
"grad_norm": 0.0012107606744393706,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0001,
|
||
|
|
"num_tokens": 662638158.0,
|
||
|
|
"reward": 1.1756139755249024,
|
||
|
|
"reward_std": 0.09611473232507706,
|
||
|
|
"rewards/accuracy_reward": 0.531640625,
|
||
|
|
"rewards/brier_reward": 0.8195863008499146,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01044921875,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.515623027086258,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0001953125,
|
||
|
|
"completions/max_length": 902.8,
|
||
|
|
"completions/max_terminated_length": 522.0,
|
||
|
|
"completions/mean_length": 229.905078125,
|
||
|
|
"completions/mean_terminated_length": 229.64991760253906,
|
||
|
|
"completions/min_length": 114.4,
|
||
|
|
"completions/min_terminated_length": 114.4,
|
||
|
|
"epoch": 0.64,
|
||
|
|
"grad_norm": 0.0014958431711420417,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_tokens": 680335074.0,
|
||
|
|
"reward": 1.1860469579696655,
|
||
|
|
"reward_std": 0.09416615813970566,
|
||
|
|
"rewards/accuracy_reward": 0.562890625,
|
||
|
|
"rewards/brier_reward": 0.8094952344894409,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01044921875,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.5647656202316285,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.64,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 376.0,
|
||
|
|
"eval_completions/max_terminated_length": 376.0,
|
||
|
|
"eval_completions/mean_length": 231.17133712768555,
|
||
|
|
"eval_completions/mean_terminated_length": 231.17133712768555,
|
||
|
|
"eval_completions/min_length": 133.75,
|
||
|
|
"eval_completions/min_terminated_length": 133.75,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 680335074.0,
|
||
|
|
"eval_reward": 1.113263338804245,
|
||
|
|
"eval_reward_std": 0.29021773487329483,
|
||
|
|
"eval_rewards/accuracy_reward": 0.43359375,
|
||
|
|
"eval_rewards/brier_reward": 0.7929318398237228,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.009765625,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.5252539217472076,
|
||
|
|
"eval_runtime": 22.36,
|
||
|
|
"eval_samples_per_second": 22.361,
|
||
|
|
"eval_steps_per_second": 0.179,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 718.8,
|
||
|
|
"completions/max_terminated_length": 514.2,
|
||
|
|
"completions/mean_length": 229.0859375,
|
||
|
|
"completions/mean_terminated_length": 228.9575408935547,
|
||
|
|
"completions/min_length": 106.6,
|
||
|
|
"completions/min_terminated_length": 106.6,
|
||
|
|
"epoch": 0.656,
|
||
|
|
"grad_norm": 0.0010967873968183994,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": -0.0001,
|
||
|
|
"num_tokens": 697537458.0,
|
||
|
|
"reward": 1.1436767816543578,
|
||
|
|
"reward_std": 0.10650671422481536,
|
||
|
|
"rewards/accuracy_reward": 0.500390625,
|
||
|
|
"rewards/brier_reward": 0.7871571063995362,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00869140625,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5562148451805115,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 501.4,
|
||
|
|
"completions/max_terminated_length": 501.4,
|
||
|
|
"completions/mean_length": 230.0017578125,
|
||
|
|
"completions/mean_terminated_length": 230.0017578125,
|
||
|
|
"completions/min_length": 113.6,
|
||
|
|
"completions/min_terminated_length": 113.6,
|
||
|
|
"epoch": 0.672,
|
||
|
|
"grad_norm": 0.0009973476408049464,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 714806116.0,
|
||
|
|
"reward": 1.1648722887039185,
|
||
|
|
"reward_std": 0.0889286831021309,
|
||
|
|
"rewards/accuracy_reward": 0.51943359375,
|
||
|
|
"rewards/brier_reward": 0.8103100538253785,
|
||
|
|
"rewards/confidence_one_or_zero": 0.01181640625,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.534494137763977,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 771.2,
|
||
|
|
"completions/max_terminated_length": 594.6,
|
||
|
|
"completions/mean_length": 229.80078125,
|
||
|
|
"completions/mean_terminated_length": 229.6729309082031,
|
||
|
|
"completions/min_length": 108.2,
|
||
|
|
"completions/min_terminated_length": 108.2,
|
||
|
|
"epoch": 0.688,
|
||
|
|
"grad_norm": 0.003850990440696478,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_tokens": 732113196.0,
|
||
|
|
"reward": 1.170497512817383,
|
||
|
|
"reward_std": 0.09079546928405761,
|
||
|
|
"rewards/accuracy_reward": 0.5337890625,
|
||
|
|
"rewards/brier_reward": 0.8074003338813782,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00693359375,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5142109453678131,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 434.0,
|
||
|
|
"completions/max_terminated_length": 434.0,
|
||
|
|
"completions/mean_length": 224.29423828125,
|
||
|
|
"completions/mean_terminated_length": 224.29423828125,
|
||
|
|
"completions/min_length": 109.6,
|
||
|
|
"completions/min_terminated_length": 109.6,
|
||
|
|
"epoch": 0.704,
|
||
|
|
"grad_norm": 0.000887486501596868,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_tokens": 749276113.0,
|
||
|
|
"reward": 1.1758257865905761,
|
||
|
|
"reward_std": 0.08791445046663285,
|
||
|
|
"rewards/accuracy_reward": 0.53662109375,
|
||
|
|
"rewards/brier_reward": 0.8150294065475464,
|
||
|
|
"rewards/confidence_one_or_zero": 0.010546875,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5334873080253602,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 443.8,
|
||
|
|
"completions/max_terminated_length": 443.8,
|
||
|
|
"completions/mean_length": 221.7052734375,
|
||
|
|
"completions/mean_terminated_length": 221.7052734375,
|
||
|
|
"completions/min_length": 106.2,
|
||
|
|
"completions/min_terminated_length": 106.2,
|
||
|
|
"epoch": 0.72,
|
||
|
|
"grad_norm": 0.001327179721556604,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0007,
|
||
|
|
"num_tokens": 766556231.0,
|
||
|
|
"reward": 1.1897984266281127,
|
||
|
|
"reward_std": 0.08169474899768829,
|
||
|
|
"rewards/accuracy_reward": 0.56044921875,
|
||
|
|
"rewards/brier_reward": 0.8191466093063354,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00361328125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5215481758117676,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 440.6,
|
||
|
|
"completions/max_terminated_length": 440.6,
|
||
|
|
"completions/mean_length": 220.46484375,
|
||
|
|
"completions/mean_terminated_length": 220.46484375,
|
||
|
|
"completions/min_length": 108.6,
|
||
|
|
"completions/min_terminated_length": 108.6,
|
||
|
|
"epoch": 0.736,
|
||
|
|
"grad_norm": 0.0015731732128188014,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_tokens": 783753375.0,
|
||
|
|
"reward": 1.1876837491989136,
|
||
|
|
"reward_std": 0.08695107698440552,
|
||
|
|
"rewards/accuracy_reward": 0.566015625,
|
||
|
|
"rewards/brier_reward": 0.8093507885932922,
|
||
|
|
"rewards/confidence_one_or_zero": 0.003515625,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5321669936180115,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 644.6,
|
||
|
|
"completions/max_terminated_length": 438.8,
|
||
|
|
"completions/mean_length": 220.48974609375,
|
||
|
|
"completions/mean_terminated_length": 220.36173095703126,
|
||
|
|
"completions/min_length": 106.8,
|
||
|
|
"completions/min_terminated_length": 106.8,
|
||
|
|
"epoch": 0.752,
|
||
|
|
"grad_norm": 0.0010616250801831484,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 801238390.0,
|
||
|
|
"reward": 1.1859095811843872,
|
||
|
|
"reward_std": 0.08611637353897095,
|
||
|
|
"rewards/accuracy_reward": 0.561328125,
|
||
|
|
"rewards/brier_reward": 0.8105875492095947,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00380859375,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5318828105926514,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 500.2,
|
||
|
|
"completions/max_terminated_length": 500.2,
|
||
|
|
"completions/mean_length": 230.7822265625,
|
||
|
|
"completions/mean_terminated_length": 230.7822265625,
|
||
|
|
"completions/min_length": 117.0,
|
||
|
|
"completions/min_terminated_length": 117.0,
|
||
|
|
"epoch": 0.768,
|
||
|
|
"grad_norm": 0.0017465156270191073,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0008,
|
||
|
|
"num_tokens": 818534304.0,
|
||
|
|
"reward": 1.1670047283172607,
|
||
|
|
"reward_std": 0.09165385216474534,
|
||
|
|
"rewards/accuracy_reward": 0.51474609375,
|
||
|
|
"rewards/brier_reward": 0.8192623376846313,
|
||
|
|
"rewards/confidence_one_or_zero": 0.003125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5243632674217225,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 461.4,
|
||
|
|
"completions/max_terminated_length": 461.4,
|
||
|
|
"completions/mean_length": 234.57412109375,
|
||
|
|
"completions/mean_terminated_length": 234.57412109375,
|
||
|
|
"completions/min_length": 124.4,
|
||
|
|
"completions/min_terminated_length": 124.4,
|
||
|
|
"epoch": 0.784,
|
||
|
|
"grad_norm": 0.0015328590525314212,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_tokens": 836110711.0,
|
||
|
|
"reward": 1.1777088403701783,
|
||
|
|
"reward_std": 0.08737877309322357,
|
||
|
|
"rewards/accuracy_reward": 0.55556640625,
|
||
|
|
"rewards/brier_reward": 0.8000456333160401,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00361328125,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5257382750511169,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 496.8,
|
||
|
|
"completions/max_terminated_length": 496.8,
|
||
|
|
"completions/mean_length": 237.66533203125,
|
||
|
|
"completions/mean_terminated_length": 237.66533203125,
|
||
|
|
"completions/min_length": 126.0,
|
||
|
|
"completions/min_terminated_length": 126.0,
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 0.0011416386114433408,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0006,
|
||
|
|
"num_tokens": 853554964.0,
|
||
|
|
"reward": 1.2036036729812623,
|
||
|
|
"reward_std": 0.08387369066476821,
|
||
|
|
"rewards/accuracy_reward": 0.5857421875,
|
||
|
|
"rewards/brier_reward": 0.8214640617370605,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0037109375,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5544834017753602,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 385.75,
|
||
|
|
"eval_completions/max_terminated_length": 385.75,
|
||
|
|
"eval_completions/mean_length": 245.00457763671875,
|
||
|
|
"eval_completions/mean_terminated_length": 245.00457763671875,
|
||
|
|
"eval_completions/min_length": 139.0,
|
||
|
|
"eval_completions/min_terminated_length": 139.0,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 853554964.0,
|
||
|
|
"eval_reward": 1.12277153134346,
|
||
|
|
"eval_reward_std": 0.28607048839330673,
|
||
|
|
"eval_rewards/accuracy_reward": 0.421875,
|
||
|
|
"eval_rewards/brier_reward": 0.8236669898033142,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.0,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.5200195461511612,
|
||
|
|
"eval_runtime": 22.5943,
|
||
|
|
"eval_samples_per_second": 22.129,
|
||
|
|
"eval_steps_per_second": 0.177,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 516.2,
|
||
|
|
"completions/max_terminated_length": 516.2,
|
||
|
|
"completions/mean_length": 238.09716796875,
|
||
|
|
"completions/mean_terminated_length": 238.09716796875,
|
||
|
|
"completions/min_length": 119.6,
|
||
|
|
"completions/min_terminated_length": 119.6,
|
||
|
|
"epoch": 0.816,
|
||
|
|
"grad_norm": 0.0013309334171935916,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 871092247.0,
|
||
|
|
"reward": 1.1905385255813599,
|
||
|
|
"reward_std": 0.09376283437013626,
|
||
|
|
"rewards/accuracy_reward": 0.58505859375,
|
||
|
|
"rewards/brier_reward": 0.7960173130035401,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0037109375,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5715683579444886,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 518.4,
|
||
|
|
"completions/max_terminated_length": 518.4,
|
||
|
|
"completions/mean_length": 245.5568359375,
|
||
|
|
"completions/mean_terminated_length": 245.5568359375,
|
||
|
|
"completions/min_length": 127.0,
|
||
|
|
"completions/min_terminated_length": 127.0,
|
||
|
|
"epoch": 0.832,
|
||
|
|
"grad_norm": 0.0011804981622844934,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_tokens": 888615101.0,
|
||
|
|
"reward": 1.1889414310455322,
|
||
|
|
"reward_std": 0.08597695529460907,
|
||
|
|
"rewards/accuracy_reward": 0.55439453125,
|
||
|
|
"rewards/brier_reward": 0.8234872579574585,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00224609375,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5678603529930115,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 555.8,
|
||
|
|
"completions/max_terminated_length": 555.8,
|
||
|
|
"completions/mean_length": 252.11484375,
|
||
|
|
"completions/mean_terminated_length": 252.11484375,
|
||
|
|
"completions/min_length": 131.0,
|
||
|
|
"completions/min_terminated_length": 131.0,
|
||
|
|
"epoch": 0.848,
|
||
|
|
"grad_norm": 0.001554572256281972,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0002,
|
||
|
|
"num_tokens": 906211125.0,
|
||
|
|
"reward": 1.1674025774002075,
|
||
|
|
"reward_std": 0.0894822582602501,
|
||
|
|
"rewards/accuracy_reward": 0.52626953125,
|
||
|
|
"rewards/brier_reward": 0.8085344791412353,
|
||
|
|
"rewards/confidence_one_or_zero": 9.765625e-05,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5744921803474426,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 795.2,
|
||
|
|
"completions/max_terminated_length": 764.6,
|
||
|
|
"completions/mean_length": 257.57587890625,
|
||
|
|
"completions/mean_terminated_length": 257.45111694335935,
|
||
|
|
"completions/min_length": 134.2,
|
||
|
|
"completions/min_terminated_length": 134.2,
|
||
|
|
"epoch": 0.864,
|
||
|
|
"grad_norm": 0.002882245695218444,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 923835518.0,
|
||
|
|
"reward": 1.2002546310424804,
|
||
|
|
"reward_std": 0.08959609419107437,
|
||
|
|
"rewards/accuracy_reward": 0.5888671875,
|
||
|
|
"rewards/brier_reward": 0.8117385983467102,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00087890625,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5740029335021972,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 777.2,
|
||
|
|
"completions/max_terminated_length": 575.4,
|
||
|
|
"completions/mean_length": 259.45830078125,
|
||
|
|
"completions/mean_terminated_length": 259.33391723632815,
|
||
|
|
"completions/min_length": 127.6,
|
||
|
|
"completions/min_terminated_length": 127.6,
|
||
|
|
"epoch": 0.88,
|
||
|
|
"grad_norm": 0.001367030548863113,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0003,
|
||
|
|
"num_tokens": 941639443.0,
|
||
|
|
"reward": 1.1616749525070191,
|
||
|
|
"reward_std": 0.08919112980365754,
|
||
|
|
"rewards/accuracy_reward": 0.51142578125,
|
||
|
|
"rewards/brier_reward": 0.8120206832885742,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00126953125,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5385517477989197,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 568.6,
|
||
|
|
"completions/max_terminated_length": 568.6,
|
||
|
|
"completions/mean_length": 261.7333984375,
|
||
|
|
"completions/mean_terminated_length": 261.7333984375,
|
||
|
|
"completions/min_length": 129.6,
|
||
|
|
"completions/min_terminated_length": 129.6,
|
||
|
|
"epoch": 0.896,
|
||
|
|
"grad_norm": 0.0019051535055041313,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 959430441.0,
|
||
|
|
"reward": 1.1740495443344117,
|
||
|
|
"reward_std": 0.08368157297372818,
|
||
|
|
"rewards/accuracy_reward": 0.5408203125,
|
||
|
|
"rewards/brier_reward": 0.8072776556015014,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00068359375,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5425273656845093,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 786.4,
|
||
|
|
"completions/max_terminated_length": 605.6,
|
||
|
|
"completions/mean_length": 262.71474609375,
|
||
|
|
"completions/mean_terminated_length": 262.591015625,
|
||
|
|
"completions/min_length": 127.6,
|
||
|
|
"completions/min_terminated_length": 127.6,
|
||
|
|
"epoch": 0.912,
|
||
|
|
"grad_norm": 0.0032088656444102526,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_tokens": 977171936.0,
|
||
|
|
"reward": 1.1776597261428834,
|
||
|
|
"reward_std": 0.0879664734005928,
|
||
|
|
"rewards/accuracy_reward": 0.54130859375,
|
||
|
|
"rewards/brier_reward": 0.8142051458358764,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00048828125,
|
||
|
|
"rewards/format_reward": 0.9998046875,
|
||
|
|
"rewards/mean_confidence_reward": 0.5229726552963256,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 594.0,
|
||
|
|
"completions/max_terminated_length": 594.0,
|
||
|
|
"completions/mean_length": 255.93291015625,
|
||
|
|
"completions/mean_terminated_length": 255.93291015625,
|
||
|
|
"completions/min_length": 124.8,
|
||
|
|
"completions/min_terminated_length": 124.8,
|
||
|
|
"epoch": 0.928,
|
||
|
|
"grad_norm": 0.0023952668998390436,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0001,
|
||
|
|
"num_tokens": 994819505.0,
|
||
|
|
"reward": 1.1611377239227294,
|
||
|
|
"reward_std": 0.08492133468389511,
|
||
|
|
"rewards/accuracy_reward": 0.5228515625,
|
||
|
|
"rewards/brier_reward": 0.7994229435920716,
|
||
|
|
"rewards/confidence_one_or_zero": 0.00078125,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5169863283634186,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 775.0,
|
||
|
|
"completions/max_terminated_length": 598.2,
|
||
|
|
"completions/mean_length": 257.284765625,
|
||
|
|
"completions/mean_terminated_length": 257.160205078125,
|
||
|
|
"completions/min_length": 120.0,
|
||
|
|
"completions/min_terminated_length": 120.0,
|
||
|
|
"epoch": 0.944,
|
||
|
|
"grad_norm": 0.001820826786570251,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"num_tokens": 1012429525.0,
|
||
|
|
"reward": 1.173994493484497,
|
||
|
|
"reward_std": 0.08908755034208297,
|
||
|
|
"rewards/accuracy_reward": 0.53642578125,
|
||
|
|
"rewards/brier_reward": 0.8116598725318909,
|
||
|
|
"rewards/confidence_one_or_zero": 0.000390625,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5185712933540344,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 602.4,
|
||
|
|
"completions/max_terminated_length": 602.4,
|
||
|
|
"completions/mean_length": 259.2900390625,
|
||
|
|
"completions/mean_terminated_length": 259.2900390625,
|
||
|
|
"completions/min_length": 131.4,
|
||
|
|
"completions/min_terminated_length": 131.4,
|
||
|
|
"epoch": 0.96,
|
||
|
|
"grad_norm": 0.004218410234898329,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 1030024975.0,
|
||
|
|
"reward": 1.1701999187469483,
|
||
|
|
"reward_std": 0.07609933465719224,
|
||
|
|
"rewards/accuracy_reward": 0.5234375,
|
||
|
|
"rewards/brier_reward": 0.8172542452812195,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0001953125,
|
||
|
|
"rewards/format_reward": 0.99970703125,
|
||
|
|
"rewards/mean_confidence_reward": 0.5500380873680115,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.96,
|
||
|
|
"eval_completions/clipped_ratio": 0.0,
|
||
|
|
"eval_completions/max_length": 430.5,
|
||
|
|
"eval_completions/max_terminated_length": 430.5,
|
||
|
|
"eval_completions/mean_length": 257.0390625,
|
||
|
|
"eval_completions/mean_terminated_length": 257.0390625,
|
||
|
|
"eval_completions/min_length": 164.25,
|
||
|
|
"eval_completions/min_terminated_length": 164.25,
|
||
|
|
"eval_loss": 0.0,
|
||
|
|
"eval_num_tokens": 1030024975.0,
|
||
|
|
"eval_reward": 1.1227055788040161,
|
||
|
|
"eval_reward_std": 0.2832227647304535,
|
||
|
|
"eval_rewards/accuracy_reward": 0.439453125,
|
||
|
|
"eval_rewards/brier_reward": 0.8059570342302322,
|
||
|
|
"eval_rewards/confidence_one_or_zero": 0.0,
|
||
|
|
"eval_rewards/format_reward": 1.0,
|
||
|
|
"eval_rewards/mean_confidence_reward": 0.5294921696186066,
|
||
|
|
"eval_runtime": 25.4806,
|
||
|
|
"eval_samples_per_second": 19.623,
|
||
|
|
"eval_steps_per_second": 0.157,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 605.2,
|
||
|
|
"completions/max_terminated_length": 605.2,
|
||
|
|
"completions/mean_length": 258.0880859375,
|
||
|
|
"completions/mean_terminated_length": 258.0880859375,
|
||
|
|
"completions/min_length": 130.4,
|
||
|
|
"completions/min_terminated_length": 130.4,
|
||
|
|
"epoch": 0.976,
|
||
|
|
"grad_norm": 0.0010340906446799636,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0005,
|
||
|
|
"num_tokens": 1047528917.0,
|
||
|
|
"reward": 1.1855917692184448,
|
||
|
|
"reward_std": 0.07872401475906372,
|
||
|
|
"rewards/accuracy_reward": 0.5564453125,
|
||
|
|
"rewards/brier_reward": 0.8147371768951416,
|
||
|
|
"rewards/confidence_one_or_zero": 9.765625e-05,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5425136804580688,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 9.765625e-05,
|
||
|
|
"completions/max_length": 781.0,
|
||
|
|
"completions/max_terminated_length": 594.0,
|
||
|
|
"completions/mean_length": 255.09814453125,
|
||
|
|
"completions/mean_terminated_length": 254.97261962890624,
|
||
|
|
"completions/min_length": 130.0,
|
||
|
|
"completions/min_terminated_length": 130.0,
|
||
|
|
"epoch": 0.992,
|
||
|
|
"grad_norm": 0.016297942027449608,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0004,
|
||
|
|
"num_tokens": 1065269602.0,
|
||
|
|
"reward": 1.166081428527832,
|
||
|
|
"reward_std": 0.07803938686847686,
|
||
|
|
"rewards/accuracy_reward": 0.521484375,
|
||
|
|
"rewards/brier_reward": 0.8107750654220581,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0001953125,
|
||
|
|
"rewards/format_reward": 0.99990234375,
|
||
|
|
"rewards/mean_confidence_reward": 0.5243340075016022,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 579.0,
|
||
|
|
"completions/max_terminated_length": 579.0,
|
||
|
|
"completions/mean_length": 257.6454162597656,
|
||
|
|
"completions/mean_terminated_length": 257.6454162597656,
|
||
|
|
"completions/min_length": 129.5,
|
||
|
|
"completions/min_terminated_length": 129.5,
|
||
|
|
"epoch": 0.9984,
|
||
|
|
"num_tokens": 1072324341.0,
|
||
|
|
"reward": 1.1739696860313416,
|
||
|
|
"reward_std": 0.08398981019854546,
|
||
|
|
"rewards/accuracy_reward": 0.55712890625,
|
||
|
|
"rewards/brier_reward": 0.7908094227313995,
|
||
|
|
"rewards/confidence_one_or_zero": 0.0,
|
||
|
|
"rewards/format_reward": 1.0,
|
||
|
|
"rewards/mean_confidence_reward": 0.5394628942012787,
|
||
|
|
"step": 312,
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_loss": 0.004670050070182277,
|
||
|
|
"train_runtime": 88432.5734,
|
||
|
|
"train_samples_per_second": 0.226,
|
||
|
|
"train_steps_per_second": 0.004
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 312,
|
||
|
|
"num_input_tokens_seen": 1072324341,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 60,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|