Files
hotpot-v2-brier-7b-no-split/trainer_state.json

1875 lines
69 KiB
JSON
Raw Permalink Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 50,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0400390625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1521.8,
"completions/mean_length": 276.6455078125,
"completions/mean_terminated_length": 224.1190185546875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.016,
"grad_norm": 0.05407445505261421,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0956,
"num_tokens": 17676882.0,
"reward": 0.6326022028923035,
"reward_std": 0.4947403073310852,
"rewards/accuracy_reward": 0.2208984375,
"rewards/brier_reward": 0.3710617899894714,
"rewards/confidence_one_or_zero": 0.27548828125,
"rewards/format_reward": 0.6732421875,
"rewards/mean_confidence_reward": 0.7399574875831604,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.034765625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1498.2,
"completions/mean_length": 258.75498046875,
"completions/mean_terminated_length": 212.7857635498047,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.032,
"grad_norm": 0.0354408323764801,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0885,
"num_tokens": 35426885.0,
"reward": 0.6595678806304932,
"reward_std": 0.46407333612442014,
"rewards/accuracy_reward": 0.21484375,
"rewards/brier_reward": 0.38378217816352844,
"rewards/confidence_one_or_zero": 0.26513671875,
"rewards/format_reward": 0.7205078125,
"rewards/mean_confidence_reward": 0.7485471248626709,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0203125,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1443.6,
"completions/mean_length": 203.36826171875,
"completions/mean_terminated_length": 175.87453918457032,
"completions/min_length": 1.8,
"completions/min_terminated_length": 1.8,
"epoch": 0.048,
"grad_norm": 0.030709726735949516,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0683,
"num_tokens": 52558112.0,
"reward": 0.8185818791389465,
"reward_std": 0.37540732622146605,
"rewards/accuracy_reward": 0.2767578125,
"rewards/brier_reward": 0.4851109802722931,
"rewards/confidence_one_or_zero": 0.26318359375,
"rewards/format_reward": 0.87529296875,
"rewards/mean_confidence_reward": 0.763912582397461,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0048828125,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1206.6,
"completions/mean_length": 136.7234375,
"completions/mean_terminated_length": 129.86282653808593,
"completions/min_length": 7.4,
"completions/min_terminated_length": 7.4,
"epoch": 0.064,
"grad_norm": 0.030825674533843994,
"learning_rate": 1e-06,
"loss": 0.0144,
"num_tokens": 68876560.0,
"reward": 0.944485855102539,
"reward_std": 0.2918001413345337,
"rewards/accuracy_reward": 0.336328125,
"rewards/brier_reward": 0.5843800187110901,
"rewards/confidence_one_or_zero": 0.20009765625,
"rewards/format_reward": 0.96826171875,
"rewards/mean_confidence_reward": 0.7416761994361878,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00166015625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 858.0,
"completions/mean_length": 121.93955078125,
"completions/mean_terminated_length": 119.58839721679688,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.08,
"grad_norm": 0.07219453901052475,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 85058373.0,
"reward": 1.0135140299797059,
"reward_std": 0.22123381197452546,
"rewards/accuracy_reward": 0.3626953125,
"rewards/brier_reward": 0.6754642128944397,
"rewards/confidence_one_or_zero": 0.083203125,
"rewards/format_reward": 0.9888671875,
"rewards/mean_confidence_reward": 0.6404195070266724,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1187.0,
"completions/max_terminated_length": 636.0,
"completions/mean_length": 122.44267578125,
"completions/mean_terminated_length": 121.89026336669922,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.096,
"grad_norm": 0.0071799191646277905,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 101356794.0,
"reward": 1.050560426712036,
"reward_std": 0.16749218702316285,
"rewards/accuracy_reward": 0.36787109375,
"rewards/brier_reward": 0.7357877850532532,
"rewards/confidence_one_or_zero": 0.04443359375,
"rewards/format_reward": 0.9974609375,
"rewards/mean_confidence_reward": 0.5114866554737091,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0005859375,
"completions/max_length": 822.8,
"completions/max_terminated_length": 457.6,
"completions/mean_length": 125.12060546875,
"completions/mean_terminated_length": 124.29327087402343,
"completions/min_length": 40.8,
"completions/min_terminated_length": 40.8,
"epoch": 0.112,
"grad_norm": 0.012783159501850605,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 117747501.0,
"reward": 1.0780974626541138,
"reward_std": 0.12303584218025207,
"rewards/accuracy_reward": 0.40283203125,
"rewards/brier_reward": 0.7566823959350586,
"rewards/confidence_one_or_zero": 0.05263671875,
"rewards/format_reward": 0.9966796875,
"rewards/mean_confidence_reward": 0.3563989281654358,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1302.2,
"completions/max_terminated_length": 380.2,
"completions/mean_length": 131.248828125,
"completions/mean_terminated_length": 130.69945831298827,
"completions/min_length": 44.2,
"completions/min_terminated_length": 44.2,
"epoch": 0.128,
"grad_norm": 0.03782346472144127,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 134008161.0,
"reward": 1.0803744792938232,
"reward_std": 0.09947807043790817,
"rewards/accuracy_reward": 0.40439453125,
"rewards/brier_reward": 0.7587952256202698,
"rewards/confidence_one_or_zero": 0.04990234375,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.30792068839073183,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 767.4,
"completions/max_terminated_length": 526.6,
"completions/mean_length": 134.51650390625,
"completions/mean_terminated_length": 134.37986755371094,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.144,
"grad_norm": 0.0044364649802446365,
"learning_rate": 1e-06,
"loss": -0.0005,
"num_tokens": 150336042.0,
"reward": 1.1184379577636718,
"reward_std": 0.10152655839920044,
"rewards/accuracy_reward": 0.5048828125,
"rewards/brier_reward": 0.7332618951797485,
"rewards/confidence_one_or_zero": 0.03076171875,
"rewards/format_reward": 0.99873046875,
"rewards/mean_confidence_reward": 0.33715721368789675,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 596.4,
"completions/max_terminated_length": 347.8,
"completions/mean_length": 141.1640625,
"completions/mean_terminated_length": 141.02822570800782,
"completions/min_length": 53.6,
"completions/min_terminated_length": 53.6,
"epoch": 0.16,
"grad_norm": 0.0031058751046657562,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 166802490.0,
"reward": 1.1045508146286012,
"reward_std": 0.10970858335494996,
"rewards/accuracy_reward": 0.4513671875,
"rewards/brier_reward": 0.7593937039375305,
"rewards/confidence_one_or_zero": 0.01396484375,
"rewards/format_reward": 0.99833984375,
"rewards/mean_confidence_reward": 0.4095295906066895,
"step": 50
},
{
"epoch": 0.16,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 249.75,
"eval_completions/max_terminated_length": 249.75,
"eval_completions/mean_length": 143.74979782104492,
"eval_completions/mean_terminated_length": 143.74979782104492,
"eval_completions/min_length": 72.75,
"eval_completions/min_terminated_length": 72.75,
"eval_loss": 0.0,
"eval_num_tokens": 166802490.0,
"eval_reward": 1.0683082938194275,
"eval_reward_std": 0.22595830261707306,
"eval_rewards/accuracy_reward": 0.3671875,
"eval_rewards/brier_reward": 0.7694281339645386,
"eval_rewards/confidence_one_or_zero": 0.0078125,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.4460156336426735,
"eval_runtime": 17.2028,
"eval_samples_per_second": 29.065,
"eval_steps_per_second": 0.233,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 356.0,
"completions/max_terminated_length": 356.0,
"completions/mean_length": 147.52265625,
"completions/mean_terminated_length": 147.52265625,
"completions/min_length": 56.2,
"completions/min_terminated_length": 56.2,
"epoch": 0.176,
"grad_norm": 0.008834286592900753,
"learning_rate": 1e-06,
"loss": -0.0005,
"num_tokens": 183550242.0,
"reward": 1.105972409248352,
"reward_std": 0.10724246203899383,
"rewards/accuracy_reward": 0.44873046875,
"rewards/brier_reward": 0.7641900300979614,
"rewards/confidence_one_or_zero": 0.01337890625,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.4539414048194885,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 342.0,
"completions/max_terminated_length": 342.0,
"completions/mean_length": 155.03759765625,
"completions/mean_terminated_length": 155.03759765625,
"completions/min_length": 68.2,
"completions/min_terminated_length": 68.2,
"epoch": 0.192,
"grad_norm": 0.0015183566138148308,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 199952643.0,
"reward": 1.1174006700515746,
"reward_std": 0.10823124945163727,
"rewards/accuracy_reward": 0.4720703125,
"rewards/brier_reward": 0.7630230784416199,
"rewards/confidence_one_or_zero": 0.00830078125,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.4763046860694885,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 429.8,
"completions/max_terminated_length": 429.8,
"completions/mean_length": 164.254296875,
"completions/mean_terminated_length": 164.254296875,
"completions/min_length": 82.2,
"completions/min_terminated_length": 82.2,
"epoch": 0.208,
"grad_norm": 0.0033636174630373716,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 216666831.0,
"reward": 1.144743847846985,
"reward_std": 0.11080079525709152,
"rewards/accuracy_reward": 0.52763671875,
"rewards/brier_reward": 0.7621429681777954,
"rewards/confidence_one_or_zero": 0.008203125,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.48511279225349424,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 674.8,
"completions/max_terminated_length": 462.6,
"completions/mean_length": 168.61201171875,
"completions/mean_terminated_length": 168.47879638671876,
"completions/min_length": 77.4,
"completions/min_terminated_length": 77.4,
"epoch": 0.224,
"grad_norm": 0.0016244107391685247,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 233546602.0,
"reward": 1.1177247285842895,
"reward_std": 0.10474657416343688,
"rewards/accuracy_reward": 0.46884765625,
"rewards/brier_reward": 0.7668938279151917,
"rewards/confidence_one_or_zero": 0.00947265625,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.4841289222240448,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 651.0,
"completions/max_terminated_length": 417.8,
"completions/mean_length": 174.24892578125,
"completions/mean_terminated_length": 174.11595458984374,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.24,
"grad_norm": 0.002639307640492916,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 250582591.0,
"reward": 1.1502854824066162,
"reward_std": 0.12000200897455215,
"rewards/accuracy_reward": 0.53818359375,
"rewards/brier_reward": 0.7634606242179871,
"rewards/confidence_one_or_zero": 0.005078125,
"rewards/format_reward": 0.99892578125,
"rewards/mean_confidence_reward": 0.489949232339859,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 843.8,
"completions/max_terminated_length": 765.6,
"completions/mean_length": 175.550390625,
"completions/mean_terminated_length": 175.02010498046874,
"completions/min_length": 81.6,
"completions/min_terminated_length": 81.6,
"epoch": 0.256,
"grad_norm": 0.02150336280465126,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 267435043.0,
"reward": 1.1393208265304566,
"reward_std": 0.11671655029058456,
"rewards/accuracy_reward": 0.5083984375,
"rewards/brier_reward": 0.7709258198738098,
"rewards/confidence_one_or_zero": 0.01162109375,
"rewards/format_reward": 0.99931640625,
"rewards/mean_confidence_reward": 0.48744922280311587,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 183.68974609375,
"completions/mean_terminated_length": 183.68974609375,
"completions/min_length": 79.4,
"completions/min_terminated_length": 79.4,
"epoch": 0.272,
"grad_norm": 0.0036507430486381054,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 284281722.0,
"reward": 1.1324650287628173,
"reward_std": 0.11320338100194931,
"rewards/accuracy_reward": 0.4955078125,
"rewards/brier_reward": 0.7696167230606079,
"rewards/confidence_one_or_zero": 0.01044921875,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.48797558546066283,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 675.0,
"completions/max_terminated_length": 453.2,
"completions/mean_length": 183.3435546875,
"completions/mean_terminated_length": 183.211669921875,
"completions/min_length": 71.6,
"completions/min_terminated_length": 71.6,
"epoch": 0.288,
"grad_norm": 0.00159507489297539,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 301117336.0,
"reward": 1.1380449295043946,
"reward_std": 0.11923972368240357,
"rewards/accuracy_reward": 0.50419921875,
"rewards/brier_reward": 0.7722803950309753,
"rewards/confidence_one_or_zero": 0.00966796875,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.4882441520690918,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 729.4,
"completions/max_terminated_length": 545.0,
"completions/mean_length": 188.354296875,
"completions/mean_terminated_length": 188.22288818359374,
"completions/min_length": 74.2,
"completions/min_terminated_length": 74.2,
"epoch": 0.304,
"grad_norm": 0.0020240871235728264,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 317976036.0,
"reward": 1.1388062238693237,
"reward_std": 0.11345189213752746,
"rewards/accuracy_reward": 0.50478515625,
"rewards/brier_reward": 0.7739005923271179,
"rewards/confidence_one_or_zero": 0.01728515625,
"rewards/format_reward": 0.99892578125,
"rewards/mean_confidence_reward": 0.48295703530311584,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 662.4,
"completions/max_terminated_length": 447.6,
"completions/mean_length": 191.061328125,
"completions/mean_terminated_length": 190.9298309326172,
"completions/min_length": 86.8,
"completions/min_terminated_length": 86.8,
"epoch": 0.32,
"grad_norm": 0.0013722889125347137,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 335021208.0,
"reward": 1.1509589195251464,
"reward_std": 0.1035462662577629,
"rewards/accuracy_reward": 0.52197265625,
"rewards/brier_reward": 0.7800418734550476,
"rewards/confidence_one_or_zero": 0.01318359375,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.4988081157207489,
"step": 100
},
{
"epoch": 0.32,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 411.75,
"eval_completions/max_terminated_length": 411.75,
"eval_completions/mean_length": 195.53663635253906,
"eval_completions/mean_terminated_length": 195.53663635253906,
"eval_completions/min_length": 107.25,
"eval_completions/min_terminated_length": 107.25,
"eval_loss": 0.0,
"eval_num_tokens": 335021208.0,
"eval_reward": 1.0887417793273926,
"eval_reward_std": 0.25174758210778236,
"eval_rewards/accuracy_reward": 0.400390625,
"eval_rewards/brier_reward": 0.7770919799804688,
"eval_rewards/confidence_one_or_zero": 0.013671875,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.46802735328674316,
"eval_runtime": 22.2738,
"eval_samples_per_second": 22.448,
"eval_steps_per_second": 0.18,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1125.0,
"completions/max_terminated_length": 479.8,
"completions/mean_length": 196.13671875,
"completions/mean_terminated_length": 195.61272583007812,
"completions/min_length": 92.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.336,
"grad_norm": 0.0013452547136694193,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 351752080.0,
"reward": 1.1555601119995118,
"reward_std": 0.11393154710531235,
"rewards/accuracy_reward": 0.52939453125,
"rewards/brier_reward": 0.7823106408119201,
"rewards/confidence_one_or_zero": 0.01396484375,
"rewards/format_reward": 0.9994140625,
"rewards/mean_confidence_reward": 0.49622313380241395,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 908.0,
"completions/max_terminated_length": 493.8,
"completions/mean_length": 199.18369140625,
"completions/mean_terminated_length": 198.923095703125,
"completions/min_length": 97.6,
"completions/min_terminated_length": 97.6,
"epoch": 0.352,
"grad_norm": 0.002751865889877081,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 369052137.0,
"reward": 1.1255475521087646,
"reward_std": 0.11049925088882447,
"rewards/accuracy_reward": 0.4666015625,
"rewards/brier_reward": 0.7846879482269287,
"rewards/confidence_one_or_zero": 0.01533203125,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.4882112622261047,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 627.8,
"completions/max_terminated_length": 627.8,
"completions/mean_length": 203.8998046875,
"completions/mean_terminated_length": 203.8998046875,
"completions/min_length": 95.6,
"completions/min_terminated_length": 95.6,
"epoch": 0.368,
"grad_norm": 0.002145805163308978,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 386205543.0,
"reward": 1.1361007690429688,
"reward_std": 0.1053838849067688,
"rewards/accuracy_reward": 0.48994140625,
"rewards/brier_reward": 0.7822591662406921,
"rewards/confidence_one_or_zero": 0.01376953125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5061289191246032,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 659.6,
"completions/max_terminated_length": 659.6,
"completions/mean_length": 201.56748046875,
"completions/mean_terminated_length": 201.56748046875,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.384,
"grad_norm": 0.002008226700127125,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 403126106.0,
"reward": 1.1584414958953857,
"reward_std": 0.10435761213302612,
"rewards/accuracy_reward": 0.521484375,
"rewards/brier_reward": 0.7953975558280945,
"rewards/confidence_one_or_zero": 0.01845703125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5107519447803497,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 486.2,
"completions/max_terminated_length": 486.2,
"completions/mean_length": 201.58603515625,
"completions/mean_terminated_length": 201.58603515625,
"completions/min_length": 96.8,
"completions/min_terminated_length": 96.8,
"epoch": 0.4,
"grad_norm": 0.004599791020154953,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 420226795.0,
"reward": 1.1391048192977906,
"reward_std": 0.11113806515932083,
"rewards/accuracy_reward": 0.4962890625,
"rewards/brier_reward": 0.781919538974762,
"rewards/confidence_one_or_zero": 0.0142578125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5150127053260803,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 573.2,
"completions/max_terminated_length": 573.2,
"completions/mean_length": 201.14990234375,
"completions/mean_terminated_length": 201.14990234375,
"completions/min_length": 98.6,
"completions/min_terminated_length": 98.6,
"epoch": 0.416,
"grad_norm": 0.0012156119337305427,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 437167754.0,
"reward": 1.150643491744995,
"reward_std": 0.1118047833442688,
"rewards/accuracy_reward": 0.50810546875,
"rewards/brier_reward": 0.7934735059738159,
"rewards/confidence_one_or_zero": 0.0142578125,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.4986997008323669,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 674.8,
"completions/max_terminated_length": 482.4,
"completions/mean_length": 200.10625,
"completions/mean_terminated_length": 199.97588195800782,
"completions/min_length": 95.4,
"completions/min_terminated_length": 95.4,
"epoch": 0.432,
"grad_norm": 0.006859796121716499,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 454231178.0,
"reward": 1.1733263969421386,
"reward_std": 0.10346025228500366,
"rewards/accuracy_reward": 0.54521484375,
"rewards/brier_reward": 0.8016322970390319,
"rewards/confidence_one_or_zero": 0.01435546875,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5186787366867065,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 719.8,
"completions/max_terminated_length": 518.8,
"completions/mean_length": 208.76640625,
"completions/mean_terminated_length": 208.63664855957032,
"completions/min_length": 96.8,
"completions/min_terminated_length": 96.8,
"epoch": 0.448,
"grad_norm": 0.0016618920490145683,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 471321746.0,
"reward": 1.1534050226211547,
"reward_std": 0.11015735268592834,
"rewards/accuracy_reward": 0.51103515625,
"rewards/brier_reward": 0.7959691882133484,
"rewards/confidence_one_or_zero": 0.0083984375,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5418408274650574,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 566.8,
"completions/max_terminated_length": 566.8,
"completions/mean_length": 215.3693359375,
"completions/mean_terminated_length": 215.3693359375,
"completions/min_length": 99.2,
"completions/min_terminated_length": 99.2,
"epoch": 0.464,
"grad_norm": 0.0014355273451656103,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 488697944.0,
"reward": 1.1193523406982422,
"reward_std": 0.10776209384202957,
"rewards/accuracy_reward": 0.458984375,
"rewards/brier_reward": 0.7799146294593811,
"rewards/confidence_one_or_zero": 0.011328125,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.520960932970047,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 752.6,
"completions/max_terminated_length": 530.4,
"completions/mean_length": 216.18486328125,
"completions/mean_terminated_length": 215.92719421386718,
"completions/min_length": 108.8,
"completions/min_terminated_length": 108.8,
"epoch": 0.48,
"grad_norm": 0.0018099879380315542,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 505959709.0,
"reward": 1.1509707689285278,
"reward_std": 0.10748694986104965,
"rewards/accuracy_reward": 0.512109375,
"rewards/brier_reward": 0.7900263905525208,
"rewards/confidence_one_or_zero": 0.01904296875,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.4951982319355011,
"step": 150
},
{
"epoch": 0.48,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 374.5,
"eval_completions/max_terminated_length": 374.5,
"eval_completions/mean_length": 216.64392471313477,
"eval_completions/mean_terminated_length": 216.64392471313477,
"eval_completions/min_length": 125.0,
"eval_completions/min_terminated_length": 125.0,
"eval_loss": 0.0,
"eval_num_tokens": 505959709.0,
"eval_reward": 1.1084575355052948,
"eval_reward_std": 0.2621918395161629,
"eval_rewards/accuracy_reward": 0.41015625,
"eval_rewards/brier_reward": 0.8067578077316284,
"eval_rewards/confidence_one_or_zero": 0.0234375,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.4730468988418579,
"eval_runtime": 21.7939,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 0.184,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 588.8,
"completions/max_terminated_length": 588.8,
"completions/mean_length": 218.93642578125,
"completions/mean_terminated_length": 218.93642578125,
"completions/min_length": 106.4,
"completions/min_terminated_length": 106.4,
"epoch": 0.496,
"grad_norm": 0.0012288556899875402,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 523509458.0,
"reward": 1.1674549341201783,
"reward_std": 0.0982852265238762,
"rewards/accuracy_reward": 0.5412109375,
"rewards/brier_reward": 0.7936979055404663,
"rewards/confidence_one_or_zero": 0.01201171875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5015585958957672,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 729.2,
"completions/max_terminated_length": 520.6,
"completions/mean_length": 218.68642578125,
"completions/mean_terminated_length": 218.5576599121094,
"completions/min_length": 110.6,
"completions/min_terminated_length": 110.6,
"epoch": 0.512,
"grad_norm": 0.0026432271115481853,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 540894471.0,
"reward": 1.1772954702377318,
"reward_std": 0.10269609093666077,
"rewards/accuracy_reward": 0.54892578125,
"rewards/brier_reward": 0.8057618141174316,
"rewards/confidence_one_or_zero": 0.00927734375,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5223603427410126,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00029296875,
"completions/max_length": 1145.4,
"completions/max_terminated_length": 564.4,
"completions/mean_length": 220.1521484375,
"completions/mean_terminated_length": 219.7665252685547,
"completions/min_length": 113.2,
"completions/min_terminated_length": 113.2,
"epoch": 0.528,
"grad_norm": 0.0011872347677126527,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 558178365.0,
"reward": 1.1743324041366576,
"reward_std": 0.09865072965621949,
"rewards/accuracy_reward": 0.53583984375,
"rewards/brier_reward": 0.8131169199943542,
"rewards/confidence_one_or_zero": 0.0087890625,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.5171630859375,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 908.0,
"completions/max_terminated_length": 513.6,
"completions/mean_length": 224.51904296875,
"completions/mean_terminated_length": 224.26331481933593,
"completions/min_length": 103.4,
"completions/min_terminated_length": 103.4,
"epoch": 0.544,
"grad_norm": 0.0013052740832790732,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 575641024.0,
"reward": 1.180383038520813,
"reward_std": 0.10921536087989807,
"rewards/accuracy_reward": 0.56044921875,
"rewards/brier_reward": 0.80060875415802,
"rewards/confidence_one_or_zero": 0.00556640625,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.5579150438308715,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 497.4,
"completions/max_terminated_length": 497.4,
"completions/mean_length": 223.90068359375,
"completions/mean_terminated_length": 223.90068359375,
"completions/min_length": 108.6,
"completions/min_terminated_length": 108.6,
"epoch": 0.56,
"grad_norm": 0.0013972694287076592,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 592755175.0,
"reward": 1.1710703134536744,
"reward_std": 0.10018587708473206,
"rewards/accuracy_reward": 0.53154296875,
"rewards/brier_reward": 0.8105966329574585,
"rewards/confidence_one_or_zero": 0.00439453125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5557578206062317,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 913.2,
"completions/max_terminated_length": 470.2,
"completions/mean_length": 222.59306640625,
"completions/mean_terminated_length": 222.33611450195312,
"completions/min_length": 112.2,
"completions/min_terminated_length": 112.2,
"epoch": 0.576,
"grad_norm": 0.0014351216377690434,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 610221152.0,
"reward": 1.1574892282485962,
"reward_std": 0.09260296672582627,
"rewards/accuracy_reward": 0.516796875,
"rewards/brier_reward": 0.7984735131263733,
"rewards/confidence_one_or_zero": 0.00673828125,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.5536435604095459,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.6,
"completions/max_terminated_length": 476.6,
"completions/mean_length": 223.04169921875,
"completions/mean_terminated_length": 223.04169921875,
"completions/min_length": 112.4,
"completions/min_terminated_length": 112.4,
"epoch": 0.592,
"grad_norm": 0.002771401545032859,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 627672811.0,
"reward": 1.1642754554748536,
"reward_std": 0.09222442507743836,
"rewards/accuracy_reward": 0.52421875,
"rewards/brier_reward": 0.80452641248703,
"rewards/confidence_one_or_zero": 0.008984375,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5350224733352661,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 928.8,
"completions/max_terminated_length": 539.6,
"completions/mean_length": 225.85869140625,
"completions/mean_terminated_length": 225.60252990722657,
"completions/min_length": 113.2,
"completions/min_terminated_length": 113.2,
"epoch": 0.608,
"grad_norm": 0.0009160145418718457,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 644985092.0,
"reward": 1.1762210130691528,
"reward_std": 0.0809452623128891,
"rewards/accuracy_reward": 0.52841796875,
"rewards/brier_reward": 0.8242184281349182,
"rewards/confidence_one_or_zero": 0.01025390625,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5095595717430115,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 490.2,
"completions/max_terminated_length": 490.2,
"completions/mean_length": 225.5041015625,
"completions/mean_terminated_length": 225.5041015625,
"completions/min_length": 107.4,
"completions/min_terminated_length": 107.4,
"epoch": 0.624,
"grad_norm": 0.0012107606744393706,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 662638158.0,
"reward": 1.1756139755249024,
"reward_std": 0.09611473232507706,
"rewards/accuracy_reward": 0.531640625,
"rewards/brier_reward": 0.8195863008499146,
"rewards/confidence_one_or_zero": 0.01044921875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.515623027086258,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 902.8,
"completions/max_terminated_length": 522.0,
"completions/mean_length": 229.905078125,
"completions/mean_terminated_length": 229.64991760253906,
"completions/min_length": 114.4,
"completions/min_terminated_length": 114.4,
"epoch": 0.64,
"grad_norm": 0.0014958431711420417,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 680335074.0,
"reward": 1.1860469579696655,
"reward_std": 0.09416615813970566,
"rewards/accuracy_reward": 0.562890625,
"rewards/brier_reward": 0.8094952344894409,
"rewards/confidence_one_or_zero": 0.01044921875,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.5647656202316285,
"step": 200
},
{
"epoch": 0.64,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 376.0,
"eval_completions/max_terminated_length": 376.0,
"eval_completions/mean_length": 231.17133712768555,
"eval_completions/mean_terminated_length": 231.17133712768555,
"eval_completions/min_length": 133.75,
"eval_completions/min_terminated_length": 133.75,
"eval_loss": 0.0,
"eval_num_tokens": 680335074.0,
"eval_reward": 1.113263338804245,
"eval_reward_std": 0.29021773487329483,
"eval_rewards/accuracy_reward": 0.43359375,
"eval_rewards/brier_reward": 0.7929318398237228,
"eval_rewards/confidence_one_or_zero": 0.009765625,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.5252539217472076,
"eval_runtime": 22.36,
"eval_samples_per_second": 22.361,
"eval_steps_per_second": 0.179,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 718.8,
"completions/max_terminated_length": 514.2,
"completions/mean_length": 229.0859375,
"completions/mean_terminated_length": 228.9575408935547,
"completions/min_length": 106.6,
"completions/min_terminated_length": 106.6,
"epoch": 0.656,
"grad_norm": 0.0010967873968183994,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 697537458.0,
"reward": 1.1436767816543578,
"reward_std": 0.10650671422481536,
"rewards/accuracy_reward": 0.500390625,
"rewards/brier_reward": 0.7871571063995362,
"rewards/confidence_one_or_zero": 0.00869140625,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5562148451805115,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 501.4,
"completions/max_terminated_length": 501.4,
"completions/mean_length": 230.0017578125,
"completions/mean_terminated_length": 230.0017578125,
"completions/min_length": 113.6,
"completions/min_terminated_length": 113.6,
"epoch": 0.672,
"grad_norm": 0.0009973476408049464,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 714806116.0,
"reward": 1.1648722887039185,
"reward_std": 0.0889286831021309,
"rewards/accuracy_reward": 0.51943359375,
"rewards/brier_reward": 0.8103100538253785,
"rewards/confidence_one_or_zero": 0.01181640625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.534494137763977,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 771.2,
"completions/max_terminated_length": 594.6,
"completions/mean_length": 229.80078125,
"completions/mean_terminated_length": 229.6729309082031,
"completions/min_length": 108.2,
"completions/min_terminated_length": 108.2,
"epoch": 0.688,
"grad_norm": 0.003850990440696478,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 732113196.0,
"reward": 1.170497512817383,
"reward_std": 0.09079546928405761,
"rewards/accuracy_reward": 0.5337890625,
"rewards/brier_reward": 0.8074003338813782,
"rewards/confidence_one_or_zero": 0.00693359375,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5142109453678131,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 434.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 224.29423828125,
"completions/mean_terminated_length": 224.29423828125,
"completions/min_length": 109.6,
"completions/min_terminated_length": 109.6,
"epoch": 0.704,
"grad_norm": 0.000887486501596868,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 749276113.0,
"reward": 1.1758257865905761,
"reward_std": 0.08791445046663285,
"rewards/accuracy_reward": 0.53662109375,
"rewards/brier_reward": 0.8150294065475464,
"rewards/confidence_one_or_zero": 0.010546875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5334873080253602,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.8,
"completions/max_terminated_length": 443.8,
"completions/mean_length": 221.7052734375,
"completions/mean_terminated_length": 221.7052734375,
"completions/min_length": 106.2,
"completions/min_terminated_length": 106.2,
"epoch": 0.72,
"grad_norm": 0.001327179721556604,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 766556231.0,
"reward": 1.1897984266281127,
"reward_std": 0.08169474899768829,
"rewards/accuracy_reward": 0.56044921875,
"rewards/brier_reward": 0.8191466093063354,
"rewards/confidence_one_or_zero": 0.00361328125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5215481758117676,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 440.6,
"completions/max_terminated_length": 440.6,
"completions/mean_length": 220.46484375,
"completions/mean_terminated_length": 220.46484375,
"completions/min_length": 108.6,
"completions/min_terminated_length": 108.6,
"epoch": 0.736,
"grad_norm": 0.0015731732128188014,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 783753375.0,
"reward": 1.1876837491989136,
"reward_std": 0.08695107698440552,
"rewards/accuracy_reward": 0.566015625,
"rewards/brier_reward": 0.8093507885932922,
"rewards/confidence_one_or_zero": 0.003515625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5321669936180115,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 644.6,
"completions/max_terminated_length": 438.8,
"completions/mean_length": 220.48974609375,
"completions/mean_terminated_length": 220.36173095703126,
"completions/min_length": 106.8,
"completions/min_terminated_length": 106.8,
"epoch": 0.752,
"grad_norm": 0.0010616250801831484,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 801238390.0,
"reward": 1.1859095811843872,
"reward_std": 0.08611637353897095,
"rewards/accuracy_reward": 0.561328125,
"rewards/brier_reward": 0.8105875492095947,
"rewards/confidence_one_or_zero": 0.00380859375,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5318828105926514,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 500.2,
"completions/max_terminated_length": 500.2,
"completions/mean_length": 230.7822265625,
"completions/mean_terminated_length": 230.7822265625,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.768,
"grad_norm": 0.0017465156270191073,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 818534304.0,
"reward": 1.1670047283172607,
"reward_std": 0.09165385216474534,
"rewards/accuracy_reward": 0.51474609375,
"rewards/brier_reward": 0.8192623376846313,
"rewards/confidence_one_or_zero": 0.003125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5243632674217225,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 461.4,
"completions/max_terminated_length": 461.4,
"completions/mean_length": 234.57412109375,
"completions/mean_terminated_length": 234.57412109375,
"completions/min_length": 124.4,
"completions/min_terminated_length": 124.4,
"epoch": 0.784,
"grad_norm": 0.0015328590525314212,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 836110711.0,
"reward": 1.1777088403701783,
"reward_std": 0.08737877309322357,
"rewards/accuracy_reward": 0.55556640625,
"rewards/brier_reward": 0.8000456333160401,
"rewards/confidence_one_or_zero": 0.00361328125,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5257382750511169,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.8,
"completions/max_terminated_length": 496.8,
"completions/mean_length": 237.66533203125,
"completions/mean_terminated_length": 237.66533203125,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.8,
"grad_norm": 0.0011416386114433408,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 853554964.0,
"reward": 1.2036036729812623,
"reward_std": 0.08387369066476821,
"rewards/accuracy_reward": 0.5857421875,
"rewards/brier_reward": 0.8214640617370605,
"rewards/confidence_one_or_zero": 0.0037109375,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5544834017753602,
"step": 250
},
{
"epoch": 0.8,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 385.75,
"eval_completions/max_terminated_length": 385.75,
"eval_completions/mean_length": 245.00457763671875,
"eval_completions/mean_terminated_length": 245.00457763671875,
"eval_completions/min_length": 139.0,
"eval_completions/min_terminated_length": 139.0,
"eval_loss": 0.0,
"eval_num_tokens": 853554964.0,
"eval_reward": 1.12277153134346,
"eval_reward_std": 0.28607048839330673,
"eval_rewards/accuracy_reward": 0.421875,
"eval_rewards/brier_reward": 0.8236669898033142,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.5200195461511612,
"eval_runtime": 22.5943,
"eval_samples_per_second": 22.129,
"eval_steps_per_second": 0.177,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 516.2,
"completions/max_terminated_length": 516.2,
"completions/mean_length": 238.09716796875,
"completions/mean_terminated_length": 238.09716796875,
"completions/min_length": 119.6,
"completions/min_terminated_length": 119.6,
"epoch": 0.816,
"grad_norm": 0.0013309334171935916,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 871092247.0,
"reward": 1.1905385255813599,
"reward_std": 0.09376283437013626,
"rewards/accuracy_reward": 0.58505859375,
"rewards/brier_reward": 0.7960173130035401,
"rewards/confidence_one_or_zero": 0.0037109375,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5715683579444886,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 518.4,
"completions/max_terminated_length": 518.4,
"completions/mean_length": 245.5568359375,
"completions/mean_terminated_length": 245.5568359375,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.832,
"grad_norm": 0.0011804981622844934,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 888615101.0,
"reward": 1.1889414310455322,
"reward_std": 0.08597695529460907,
"rewards/accuracy_reward": 0.55439453125,
"rewards/brier_reward": 0.8234872579574585,
"rewards/confidence_one_or_zero": 0.00224609375,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5678603529930115,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 555.8,
"completions/max_terminated_length": 555.8,
"completions/mean_length": 252.11484375,
"completions/mean_terminated_length": 252.11484375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.848,
"grad_norm": 0.001554572256281972,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 906211125.0,
"reward": 1.1674025774002075,
"reward_std": 0.0894822582602501,
"rewards/accuracy_reward": 0.52626953125,
"rewards/brier_reward": 0.8085344791412353,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5744921803474426,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 795.2,
"completions/max_terminated_length": 764.6,
"completions/mean_length": 257.57587890625,
"completions/mean_terminated_length": 257.45111694335935,
"completions/min_length": 134.2,
"completions/min_terminated_length": 134.2,
"epoch": 0.864,
"grad_norm": 0.002882245695218444,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 923835518.0,
"reward": 1.2002546310424804,
"reward_std": 0.08959609419107437,
"rewards/accuracy_reward": 0.5888671875,
"rewards/brier_reward": 0.8117385983467102,
"rewards/confidence_one_or_zero": 0.00087890625,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5740029335021972,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 777.2,
"completions/max_terminated_length": 575.4,
"completions/mean_length": 259.45830078125,
"completions/mean_terminated_length": 259.33391723632815,
"completions/min_length": 127.6,
"completions/min_terminated_length": 127.6,
"epoch": 0.88,
"grad_norm": 0.001367030548863113,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 941639443.0,
"reward": 1.1616749525070191,
"reward_std": 0.08919112980365754,
"rewards/accuracy_reward": 0.51142578125,
"rewards/brier_reward": 0.8120206832885742,
"rewards/confidence_one_or_zero": 0.00126953125,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5385517477989197,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 568.6,
"completions/max_terminated_length": 568.6,
"completions/mean_length": 261.7333984375,
"completions/mean_terminated_length": 261.7333984375,
"completions/min_length": 129.6,
"completions/min_terminated_length": 129.6,
"epoch": 0.896,
"grad_norm": 0.0019051535055041313,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 959430441.0,
"reward": 1.1740495443344117,
"reward_std": 0.08368157297372818,
"rewards/accuracy_reward": 0.5408203125,
"rewards/brier_reward": 0.8072776556015014,
"rewards/confidence_one_or_zero": 0.00068359375,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5425273656845093,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 786.4,
"completions/max_terminated_length": 605.6,
"completions/mean_length": 262.71474609375,
"completions/mean_terminated_length": 262.591015625,
"completions/min_length": 127.6,
"completions/min_terminated_length": 127.6,
"epoch": 0.912,
"grad_norm": 0.0032088656444102526,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 977171936.0,
"reward": 1.1776597261428834,
"reward_std": 0.0879664734005928,
"rewards/accuracy_reward": 0.54130859375,
"rewards/brier_reward": 0.8142051458358764,
"rewards/confidence_one_or_zero": 0.00048828125,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.5229726552963256,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 594.0,
"completions/max_terminated_length": 594.0,
"completions/mean_length": 255.93291015625,
"completions/mean_terminated_length": 255.93291015625,
"completions/min_length": 124.8,
"completions/min_terminated_length": 124.8,
"epoch": 0.928,
"grad_norm": 0.0023952668998390436,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 994819505.0,
"reward": 1.1611377239227294,
"reward_std": 0.08492133468389511,
"rewards/accuracy_reward": 0.5228515625,
"rewards/brier_reward": 0.7994229435920716,
"rewards/confidence_one_or_zero": 0.00078125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5169863283634186,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 775.0,
"completions/max_terminated_length": 598.2,
"completions/mean_length": 257.284765625,
"completions/mean_terminated_length": 257.160205078125,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.944,
"grad_norm": 0.001820826786570251,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 1012429525.0,
"reward": 1.173994493484497,
"reward_std": 0.08908755034208297,
"rewards/accuracy_reward": 0.53642578125,
"rewards/brier_reward": 0.8116598725318909,
"rewards/confidence_one_or_zero": 0.000390625,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5185712933540344,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 602.4,
"completions/max_terminated_length": 602.4,
"completions/mean_length": 259.2900390625,
"completions/mean_terminated_length": 259.2900390625,
"completions/min_length": 131.4,
"completions/min_terminated_length": 131.4,
"epoch": 0.96,
"grad_norm": 0.004218410234898329,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 1030024975.0,
"reward": 1.1701999187469483,
"reward_std": 0.07609933465719224,
"rewards/accuracy_reward": 0.5234375,
"rewards/brier_reward": 0.8172542452812195,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.5500380873680115,
"step": 300
},
{
"epoch": 0.96,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 430.5,
"eval_completions/max_terminated_length": 430.5,
"eval_completions/mean_length": 257.0390625,
"eval_completions/mean_terminated_length": 257.0390625,
"eval_completions/min_length": 164.25,
"eval_completions/min_terminated_length": 164.25,
"eval_loss": 0.0,
"eval_num_tokens": 1030024975.0,
"eval_reward": 1.1227055788040161,
"eval_reward_std": 0.2832227647304535,
"eval_rewards/accuracy_reward": 0.439453125,
"eval_rewards/brier_reward": 0.8059570342302322,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.5294921696186066,
"eval_runtime": 25.4806,
"eval_samples_per_second": 19.623,
"eval_steps_per_second": 0.157,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 605.2,
"completions/max_terminated_length": 605.2,
"completions/mean_length": 258.0880859375,
"completions/mean_terminated_length": 258.0880859375,
"completions/min_length": 130.4,
"completions/min_terminated_length": 130.4,
"epoch": 0.976,
"grad_norm": 0.0010340906446799636,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 1047528917.0,
"reward": 1.1855917692184448,
"reward_std": 0.07872401475906372,
"rewards/accuracy_reward": 0.5564453125,
"rewards/brier_reward": 0.8147371768951416,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5425136804580688,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 781.0,
"completions/max_terminated_length": 594.0,
"completions/mean_length": 255.09814453125,
"completions/mean_terminated_length": 254.97261962890624,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.992,
"grad_norm": 0.016297942027449608,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 1065269602.0,
"reward": 1.166081428527832,
"reward_std": 0.07803938686847686,
"rewards/accuracy_reward": 0.521484375,
"rewards/brier_reward": 0.8107750654220581,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.5243340075016022,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 579.0,
"completions/max_terminated_length": 579.0,
"completions/mean_length": 257.6454162597656,
"completions/mean_terminated_length": 257.6454162597656,
"completions/min_length": 129.5,
"completions/min_terminated_length": 129.5,
"epoch": 0.9984,
"num_tokens": 1072324341.0,
"reward": 1.1739696860313416,
"reward_std": 0.08398981019854546,
"rewards/accuracy_reward": 0.55712890625,
"rewards/brier_reward": 0.7908094227313995,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.5394628942012787,
"step": 312,
"total_flos": 0.0,
"train_loss": 0.004670050070182277,
"train_runtime": 88432.5734,
"train_samples_per_second": 0.226,
"train_steps_per_second": 0.004
}
],
"logging_steps": 5,
"max_steps": 312,
"num_input_tokens_seen": 1072324341,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}