Files
qwen2.5-7B-rlcr_g8_b384_math/trainer_state.json

2264 lines
85 KiB
JSON
Raw Permalink Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4992,
"eval_steps": 15,
"global_step": 78,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4063.0,
"completions/mean_length": 587.9622802734375,
"completions/mean_terminated_length": 518.0810546875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 0.00229196366854012,
"learning_rate": 3.125e-07,
"loss": 0.0123,
"num_tokens": 1316862.0,
"reward": 0.5813632607460022,
"reward_std": 0.5275046825408936,
"rewards/accuracy_reward": 0.2630208432674408,
"rewards/brier_reward": 0.3104891777038574,
"rewards/confidence_one_or_zero": 0.3606770932674408,
"rewards/format_reward": 0.5891926884651184,
"rewards/mean_confidence_reward": 0.826708972454071,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01497395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4050.0,
"completions/mean_length": 599.0475463867188,
"completions/mean_terminated_length": 545.8883056640625,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.0128,
"grad_norm": 0.002298542531207204,
"learning_rate": 6.25e-07,
"loss": 0.0196,
"num_tokens": 2652743.0,
"reward": 0.6089980006217957,
"reward_std": 0.5383763313293457,
"rewards/accuracy_reward": 0.2760416567325592,
"rewards/brier_reward": 0.32409217953681946,
"rewards/confidence_one_or_zero": 0.3411458432674408,
"rewards/format_reward": 0.6178385615348816,
"rewards/mean_confidence_reward": 0.8342577815055847,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01432291666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4091.0,
"completions/mean_length": 601.3822021484375,
"completions/mean_terminated_length": 550.6017456054688,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0192,
"grad_norm": 0.0022709358017891645,
"learning_rate": 9.375000000000001e-07,
"loss": 0.0136,
"num_tokens": 3996250.0,
"reward": 0.5906873941421509,
"reward_std": 0.5221948623657227,
"rewards/accuracy_reward": 0.2545572817325592,
"rewards/brier_reward": 0.3135129511356354,
"rewards/confidence_one_or_zero": 0.337890625,
"rewards/format_reward": 0.61328125,
"rewards/mean_confidence_reward": 0.8254140019416809,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3967.0,
"completions/mean_length": 565.1041870117188,
"completions/mean_terminated_length": 494.7676086425781,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0256,
"grad_norm": 0.0018688800046220422,
"learning_rate": 1.25e-06,
"loss": 0.0203,
"num_tokens": 5265922.0,
"reward": 0.613066554069519,
"reward_std": 0.5181887745857239,
"rewards/accuracy_reward": 0.2669270932674408,
"rewards/brier_reward": 0.32051023840904236,
"rewards/confidence_one_or_zero": 0.330078125,
"rewards/format_reward": 0.638671875,
"rewards/mean_confidence_reward": 0.859596312046051,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01822916666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4047.0,
"completions/mean_length": 580.2845458984375,
"completions/mean_terminated_length": 515.0059814453125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.032,
"grad_norm": 0.002923916792497039,
"learning_rate": 1.5625e-06,
"loss": 0.0191,
"num_tokens": 6570791.0,
"reward": 0.5732605457305908,
"reward_std": 0.5266666412353516,
"rewards/accuracy_reward": 0.23828125,
"rewards/brier_reward": 0.29949304461479187,
"rewards/confidence_one_or_zero": 0.3203125,
"rewards/format_reward": 0.6087239384651184,
"rewards/mean_confidence_reward": 0.8248130679130554,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4043.0,
"completions/mean_length": 581.8854370117188,
"completions/mean_terminated_length": 507.11700439453125,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.0384,
"grad_norm": 0.0017212529201060534,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.0107,
"num_tokens": 7879951.0,
"reward": 0.6507998704910278,
"reward_std": 0.4878300130367279,
"rewards/accuracy_reward": 0.2682291567325592,
"rewards/brier_reward": 0.3386862277984619,
"rewards/confidence_one_or_zero": 0.3072916567325592,
"rewards/format_reward": 0.6946614384651184,
"rewards/mean_confidence_reward": 0.8365259766578674,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02213541666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4067.0,
"completions/mean_length": 578.3861083984375,
"completions/mean_terminated_length": 498.7596435546875,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.0448,
"grad_norm": 0.010415066964924335,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.0281,
"num_tokens": 9185616.0,
"reward": 0.6887897253036499,
"reward_std": 0.4675048291683197,
"rewards/accuracy_reward": 0.279296875,
"rewards/brier_reward": 0.34956130385398865,
"rewards/confidence_one_or_zero": 0.3326822817325592,
"rewards/format_reward": 0.7486979365348816,
"rewards/mean_confidence_reward": 0.8381659984588623,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3929.0,
"completions/mean_length": 520.171875,
"completions/mean_terminated_length": 463.4126892089844,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0512,
"grad_norm": 0.002210509032011032,
"learning_rate": 2.5e-06,
"loss": 0.0182,
"num_tokens": 10394440.0,
"reward": 0.7613718509674072,
"reward_std": 0.44713592529296875,
"rewards/accuracy_reward": 0.2975260317325592,
"rewards/brier_reward": 0.38209569454193115,
"rewards/confidence_one_or_zero": 0.2805989682674408,
"rewards/format_reward": 0.8430989384651184,
"rewards/mean_confidence_reward": 0.8710243105888367,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01302083333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3597.0,
"completions/mean_length": 493.451171875,
"completions/mean_terminated_length": 445.9241638183594,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.0576,
"grad_norm": 0.001208790927194059,
"learning_rate": 2.8125e-06,
"loss": 0.0073,
"num_tokens": 11570397.0,
"reward": 0.8752974271774292,
"reward_std": 0.4269093871116638,
"rewards/accuracy_reward": 0.3815104067325592,
"rewards/brier_reward": 0.4621589183807373,
"rewards/confidence_one_or_zero": 0.3411458432674408,
"rewards/format_reward": 0.9069010615348816,
"rewards/mean_confidence_reward": 0.8819381594657898,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3662.0,
"completions/mean_length": 441.7220153808594,
"completions/mean_terminated_length": 415.36328125,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.064,
"grad_norm": 0.0010935115860775113,
"learning_rate": 3.125e-06,
"loss": 0.0015,
"num_tokens": 12670578.0,
"reward": 0.8601632118225098,
"reward_std": 0.39231395721435547,
"rewards/accuracy_reward": 0.3326822817325592,
"rewards/brier_reward": 0.4344954490661621,
"rewards/confidence_one_or_zero": 0.2962239682674408,
"rewards/format_reward": 0.953125,
"rewards/mean_confidence_reward": 0.8905722498893738,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00520833333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3880.0,
"completions/mean_length": 444.0716247558594,
"completions/mean_terminated_length": 424.9515686035156,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.0704,
"grad_norm": 0.000973426504060626,
"learning_rate": 3.4375e-06,
"loss": 0.0011,
"num_tokens": 13766424.0,
"reward": 0.9216434359550476,
"reward_std": 0.3478318154811859,
"rewards/accuracy_reward": 0.3717447817325592,
"rewards/brier_reward": 0.4884470999240875,
"rewards/confidence_one_or_zero": 0.220703125,
"rewards/format_reward": 0.9830729365348816,
"rewards/mean_confidence_reward": 0.8892440795898438,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2337.0,
"completions/mean_length": 468.76953125,
"completions/mean_terminated_length": 435.40472412109375,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.0768,
"grad_norm": 0.0008072754135355353,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0078,
"num_tokens": 14894670.0,
"reward": 0.9307572245597839,
"reward_std": 0.32634100317955017,
"rewards/accuracy_reward": 0.3763020932674408,
"rewards/brier_reward": 0.5001646876335144,
"rewards/confidence_one_or_zero": 0.1946614533662796,
"rewards/format_reward": 0.9850260615348816,
"rewards/mean_confidence_reward": 0.8821842074394226,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3957.0,
"completions/mean_length": 458.2786560058594,
"completions/mean_terminated_length": 427.22784423828125,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.0832,
"grad_norm": 0.0009474229882471263,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.0075,
"num_tokens": 16008858.0,
"reward": 1.0004254579544067,
"reward_std": 0.3447754979133606,
"rewards/accuracy_reward": 0.4557291567325592,
"rewards/brier_reward": 0.5620275139808655,
"rewards/confidence_one_or_zero": 0.1796875,
"rewards/format_reward": 0.9830729365348816,
"rewards/mean_confidence_reward": 0.878938615322113,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3721.0,
"completions/mean_length": 470.76434326171875,
"completions/mean_terminated_length": 437.4178466796875,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.0896,
"grad_norm": 0.0010997421341016889,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0066,
"num_tokens": 17144288.0,
"reward": 1.0370821952819824,
"reward_std": 0.3125256896018982,
"rewards/accuracy_reward": 0.4830729067325592,
"rewards/brier_reward": 0.6021392941474915,
"rewards/confidence_one_or_zero": 0.1302083283662796,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.8621549606323242,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3954.0,
"completions/mean_length": 492.4739685058594,
"completions/mean_terminated_length": 461.7150573730469,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.096,
"grad_norm": 0.00073769356822595,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.0056,
"num_tokens": 18320432.0,
"reward": 1.0514283180236816,
"reward_std": 0.2952514588832855,
"rewards/accuracy_reward": 0.49609375,
"rewards/brier_reward": 0.6191139817237854,
"rewards/confidence_one_or_zero": 0.083984375,
"rewards/format_reward": 0.9876301884651184,
"rewards/mean_confidence_reward": 0.8452711701393127,
"step": 15
},
{
"epoch": 0.096,
"eval_completions/clipped_ratio": 0.01412259615384616,
"eval_completions/max_length": 4017.25,
"eval_completions/max_terminated_length": 2676.0,
"eval_completions/mean_length": 537.4269104003906,
"eval_completions/mean_terminated_length": 486.3598976135254,
"eval_completions/min_length": 137.875,
"eval_completions/min_terminated_length": 137.875,
"eval_loss": 0.0,
"eval_num_tokens": 18320432.0,
"eval_reward": 1.057287573814392,
"eval_reward_std": 0.43193066120147705,
"eval_rewards/accuracy_reward": 0.4990234375,
"eval_rewards/brier_reward": 0.6321354508399963,
"eval_rewards/confidence_one_or_zero": 0.0654296875,
"eval_rewards/format_reward": 0.9833984375,
"eval_rewards/mean_confidence_reward": 0.8258691281080246,
"eval_runtime": 266.5804,
"eval_samples_per_second": 3.751,
"eval_steps_per_second": 0.03,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3892.0,
"completions/mean_length": 499.75653076171875,
"completions/mean_terminated_length": 471.43963623046875,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.1024,
"grad_norm": 0.000677171687129885,
"learning_rate": 5e-06,
"loss": 0.0082,
"num_tokens": 19507706.0,
"reward": 1.1004698276519775,
"reward_std": 0.28478488326072693,
"rewards/accuracy_reward": 0.5462239384651184,
"rewards/brier_reward": 0.6664164066314697,
"rewards/confidence_one_or_zero": 0.0631510391831398,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.8320556282997131,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01236979166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4085.0,
"completions/mean_length": 554.9486083984375,
"completions/mean_terminated_length": 510.5978698730469,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.1088,
"grad_norm": 0.0006332960911095142,
"learning_rate": 4.919354838709678e-06,
"loss": 0.0081,
"num_tokens": 20777075.0,
"reward": 1.048607349395752,
"reward_std": 0.2772809863090515,
"rewards/accuracy_reward": 0.4791666567325592,
"rewards/brier_reward": 0.6343072056770325,
"rewards/confidence_one_or_zero": 0.0338541679084301,
"rewards/format_reward": 0.9837239384651184,
"rewards/mean_confidence_reward": 0.8093522191047668,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01041666666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3753.0,
"completions/mean_length": 577.5267333984375,
"completions/mean_terminated_length": 540.4901733398438,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1152,
"grad_norm": 0.0005637197173200548,
"learning_rate": 4.838709677419355e-06,
"loss": 0.0167,
"num_tokens": 22072396.0,
"reward": 1.126558780670166,
"reward_std": 0.26943597197532654,
"rewards/accuracy_reward": 0.5677083134651184,
"rewards/brier_reward": 0.6971117854118347,
"rewards/confidence_one_or_zero": 0.014973958022892475,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.7954492568969727,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3794.0,
"completions/mean_length": 599.7174682617188,
"completions/mean_terminated_length": 551.254150390625,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.1216,
"grad_norm": 0.0004333103133831173,
"learning_rate": 4.758064516129033e-06,
"loss": 0.0161,
"num_tokens": 23407450.0,
"reward": 1.1329493522644043,
"reward_std": 0.2181943953037262,
"rewards/accuracy_reward": 0.5735676884651184,
"rewards/brier_reward": 0.7066380381584167,
"rewards/confidence_one_or_zero": 0.009114583022892475,
"rewards/format_reward": 0.9856770634651184,
"rewards/mean_confidence_reward": 0.7850651741027832,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4062.0,
"completions/mean_length": 630.1165771484375,
"completions/mean_terminated_length": 589.0191040039062,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.128,
"grad_norm": 0.0005656772991642356,
"learning_rate": 4.67741935483871e-06,
"loss": 0.0139,
"num_tokens": 24786269.0,
"reward": 1.1472342014312744,
"reward_std": 0.21497444808483124,
"rewards/accuracy_reward": 0.5859375,
"rewards/brier_reward": 0.7221875190734863,
"rewards/confidence_one_or_zero": 0.0013020833721384406,
"rewards/format_reward": 0.986328125,
"rewards/mean_confidence_reward": 0.7606358528137207,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01497395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3963.0,
"completions/mean_length": 713.6373901367188,
"completions/mean_terminated_length": 662.2200927734375,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.1344,
"grad_norm": 0.0004828801902476698,
"learning_rate": 4.596774193548387e-06,
"loss": 0.0197,
"num_tokens": 26296008.0,
"reward": 1.1559076309204102,
"reward_std": 0.24854782223701477,
"rewards/accuracy_reward": 0.6009114384651184,
"rewards/brier_reward": 0.7297688126564026,
"rewards/confidence_one_or_zero": 0.0006510416860692203,
"rewards/format_reward": 0.9811198115348816,
"rewards/mean_confidence_reward": 0.7453190684318542,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01497395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3628.0,
"completions/mean_length": 667.09375,
"completions/mean_terminated_length": 614.9689331054688,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.1408,
"grad_norm": 0.035328447818756104,
"learning_rate": 4.516129032258065e-06,
"loss": 0.0153,
"num_tokens": 27729288.0,
"reward": 1.1439964771270752,
"reward_std": 0.21810662746429443,
"rewards/accuracy_reward": 0.5904948115348816,
"rewards/brier_reward": 0.7150607705116272,
"rewards/confidence_one_or_zero": 0.0045572915114462376,
"rewards/format_reward": 0.982421875,
"rewards/mean_confidence_reward": 0.7736979126930237,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4079.0,
"completions/mean_length": 759.8724365234375,
"completions/mean_terminated_length": 693.4157104492188,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.1472,
"grad_norm": 0.00043078220915049314,
"learning_rate": 4.435483870967742e-06,
"loss": 0.0252,
"num_tokens": 29316252.0,
"reward": 1.1476705074310303,
"reward_std": 0.24656717479228973,
"rewards/accuracy_reward": 0.5885416865348816,
"rewards/brier_reward": 0.7308734059333801,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9759114384651184,
"rewards/mean_confidence_reward": 0.7125194668769836,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4017.0,
"completions/mean_length": 693.0065307617188,
"completions/mean_terminated_length": 663.9592895507812,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1536,
"grad_norm": 0.0003661557857412845,
"learning_rate": 4.35483870967742e-06,
"loss": 0.0141,
"num_tokens": 30791358.0,
"reward": 1.1998891830444336,
"reward_std": 0.19935479760169983,
"rewards/accuracy_reward": 0.6471354365348816,
"rewards/brier_reward": 0.7636963725090027,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.7080404162406921,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01692708333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4061.0,
"completions/mean_length": 710.5794677734375,
"completions/mean_terminated_length": 652.2874145507812,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.16,
"grad_norm": 0.0003608058614190668,
"learning_rate": 4.274193548387097e-06,
"loss": 0.0155,
"num_tokens": 32289752.0,
"reward": 1.1741443872451782,
"reward_std": 0.20154307782649994,
"rewards/accuracy_reward": 0.6197916865348816,
"rewards/brier_reward": 0.747363269329071,
"rewards/confidence_one_or_zero": 0.0013020833721384406,
"rewards/format_reward": 0.9811198115348816,
"rewards/mean_confidence_reward": 0.6910156607627869,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01432291666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3953.0,
"completions/mean_length": 720.763671875,
"completions/mean_terminated_length": 671.718017578125,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.1664,
"grad_norm": 0.00038824151852168143,
"learning_rate": 4.193548387096774e-06,
"loss": 0.0144,
"num_tokens": 33811813.0,
"reward": 1.1451948881149292,
"reward_std": 0.1946631371974945,
"rewards/accuracy_reward": 0.5755208134651184,
"rewards/brier_reward": 0.7324332594871521,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.982421875,
"rewards/mean_confidence_reward": 0.6781575679779053,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01497395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4023.0,
"completions/mean_length": 702.4681396484375,
"completions/mean_terminated_length": 650.8810424804688,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.1728,
"grad_norm": 0.0003422526060603559,
"learning_rate": 4.112903225806452e-06,
"loss": 0.0196,
"num_tokens": 35296276.0,
"reward": 1.2147198915481567,
"reward_std": 0.1851186752319336,
"rewards/accuracy_reward": 0.6751301884651184,
"rewards/brier_reward": 0.7725251317024231,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9817708134651184,
"rewards/mean_confidence_reward": 0.6732291579246521,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01302083333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3945.0,
"completions/mean_length": 758.5983276367188,
"completions/mean_terminated_length": 714.5692749023438,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.1792,
"grad_norm": 0.0003135671140626073,
"learning_rate": 4.032258064516129e-06,
"loss": 0.018,
"num_tokens": 36873307.0,
"reward": 1.2059857845306396,
"reward_std": 0.183271586894989,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.7674267888069153,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.6653971672058105,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4058.0,
"completions/mean_length": 756.6393432617188,
"completions/mean_terminated_length": 725.9224243164062,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.1856,
"grad_norm": 0.00033678373438306153,
"learning_rate": 3.951612903225807e-06,
"loss": 0.0113,
"num_tokens": 38440297.0,
"reward": 1.204465627670288,
"reward_std": 0.16857890784740448,
"rewards/accuracy_reward": 0.654296875,
"rewards/brier_reward": 0.7656884789466858,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.6629883050918579,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3894.0,
"completions/mean_length": 784.548828125,
"completions/mean_terminated_length": 738.6475219726562,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.192,
"grad_norm": 0.00036860810359939933,
"learning_rate": 3.870967741935484e-06,
"loss": 0.0127,
"num_tokens": 40054092.0,
"reward": 1.1792367696762085,
"reward_std": 0.18021389842033386,
"rewards/accuracy_reward": 0.62109375,
"rewards/brier_reward": 0.7542936205863953,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9830729365348816,
"rewards/mean_confidence_reward": 0.6539713740348816,
"step": 30
},
{
"epoch": 0.192,
"eval_completions/clipped_ratio": 0.020207331730769232,
"eval_completions/max_length": 4084.875,
"eval_completions/max_terminated_length": 2403.25,
"eval_completions/mean_length": 779.5079650878906,
"eval_completions/mean_terminated_length": 710.9605026245117,
"eval_completions/min_length": 284.625,
"eval_completions/min_terminated_length": 284.625,
"eval_loss": 0.0,
"eval_num_tokens": 40054092.0,
"eval_reward": 1.178975060582161,
"eval_reward_std": 0.3463897779583931,
"eval_rewards/accuracy_reward": 0.625,
"eval_rewards/brier_reward": 0.7544213905930519,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.978515625,
"eval_rewards/mean_confidence_reward": 0.6520995870232582,
"eval_runtime": 278.501,
"eval_samples_per_second": 3.591,
"eval_steps_per_second": 0.029,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01041666666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4079.0,
"completions/mean_length": 741.806640625,
"completions/mean_terminated_length": 706.4993896484375,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.1984,
"grad_norm": 0.0005164485774002969,
"learning_rate": 3.7903225806451614e-06,
"loss": 0.0135,
"num_tokens": 41608667.0,
"reward": 1.2111461162567139,
"reward_std": 0.1950198858976364,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.767981767654419,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.986328125,
"rewards/mean_confidence_reward": 0.6626952886581421,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3971.0,
"completions/mean_length": 800.8229370117188,
"completions/mean_terminated_length": 748.5184936523438,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.2048,
"grad_norm": 0.0003207188274245709,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.0187,
"num_tokens": 43245683.0,
"reward": 1.1906991004943848,
"reward_std": 0.18316665291786194,
"rewards/accuracy_reward": 0.6399739384651184,
"rewards/brier_reward": 0.7602912783622742,
"rewards/confidence_one_or_zero": 0.0006510416860692203,
"rewards/format_reward": 0.9811198115348816,
"rewards/mean_confidence_reward": 0.6509439945220947,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4058.0,
"completions/mean_length": 734.4603271484375,
"completions/mean_terminated_length": 694.6001586914062,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.2112,
"grad_norm": 0.0003439242427702993,
"learning_rate": 3.6290322580645166e-06,
"loss": 0.014,
"num_tokens": 44783254.0,
"reward": 1.1869686841964722,
"reward_std": 0.18534022569656372,
"rewards/accuracy_reward": 0.6321614384651184,
"rewards/brier_reward": 0.7567365765571594,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9850260615348816,
"rewards/mean_confidence_reward": 0.6561523079872131,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4033.0,
"completions/mean_length": 749.0651245117188,
"completions/mean_terminated_length": 729.3385620117188,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.2176,
"grad_norm": 0.0005052844644524157,
"learning_rate": 3.548387096774194e-06,
"loss": 0.005,
"num_tokens": 46343578.0,
"reward": 1.1979600191116333,
"reward_std": 0.16698740422725677,
"rewards/accuracy_reward": 0.63671875,
"rewards/brier_reward": 0.765698254108429,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6605142951011658,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0026041666666666297,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4060.0,
"completions/mean_length": 719.84375,
"completions/mean_terminated_length": 711.0287475585938,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.224,
"grad_norm": 0.0003193453885614872,
"learning_rate": 3.4677419354838714e-06,
"loss": 0.0049,
"num_tokens": 47861770.0,
"reward": 1.2130991220474243,
"reward_std": 0.15396082401275635,
"rewards/accuracy_reward": 0.6575520634651184,
"rewards/brier_reward": 0.7725391387939453,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6565755009651184,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4039.0,
"completions/mean_length": 757.34375,
"completions/mean_terminated_length": 728.845703125,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.2304,
"grad_norm": 0.00032273278338834643,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.0081,
"num_tokens": 49439394.0,
"reward": 1.208127498626709,
"reward_std": 0.16208727657794952,
"rewards/accuracy_reward": 0.6575520634651184,
"rewards/brier_reward": 0.7684553265571594,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.6528971195220947,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3426.0,
"completions/mean_length": 773.3659057617188,
"completions/mean_terminated_length": 727.3095703125,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.2368,
"grad_norm": 0.00034387107007205486,
"learning_rate": 3.306451612903226e-06,
"loss": 0.0111,
"num_tokens": 51037868.0,
"reward": 1.201973557472229,
"reward_std": 0.17515644431114197,
"rewards/accuracy_reward": 0.6555989384651184,
"rewards/brier_reward": 0.764611005783081,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9837239384651184,
"rewards/mean_confidence_reward": 0.6488932371139526,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01236979166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4070.0,
"completions/mean_length": 754.919921875,
"completions/mean_terminated_length": 713.0737915039062,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.2432,
"grad_norm": 0.00033718085614964366,
"learning_rate": 3.225806451612903e-06,
"loss": 0.0094,
"num_tokens": 52609993.0,
"reward": 1.2070956230163574,
"reward_std": 0.1585657298564911,
"rewards/accuracy_reward": 0.6588541865348816,
"rewards/brier_reward": 0.7709488868713379,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.643261730670929,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01106770833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3937.0,
"completions/mean_length": 787.2845458984375,
"completions/mean_terminated_length": 750.2547607421875,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.2496,
"grad_norm": 0.00032993199420161545,
"learning_rate": 3.145161290322581e-06,
"loss": 0.0113,
"num_tokens": 54228542.0,
"reward": 1.1909804344177246,
"reward_std": 0.17440544068813324,
"rewards/accuracy_reward": 0.6373698115348816,
"rewards/brier_reward": 0.759552001953125,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9850260615348816,
"rewards/mean_confidence_reward": 0.643287718296051,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4082.0,
"completions/mean_length": 774.5560302734375,
"completions/mean_terminated_length": 750.5980224609375,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.256,
"grad_norm": 0.00032635065144859254,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.0109,
"num_tokens": 55832548.0,
"reward": 1.203762173652649,
"reward_std": 0.17923477292060852,
"rewards/accuracy_reward": 0.6490885615348816,
"rewards/brier_reward": 0.7694905400276184,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.6456055045127869,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4061.0,
"completions/mean_length": 762.052734375,
"completions/mean_terminated_length": 735.8012084960938,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.2624,
"grad_norm": 0.0003047576465178281,
"learning_rate": 2.983870967741936e-06,
"loss": 0.008,
"num_tokens": 57414589.0,
"reward": 1.2368627786636353,
"reward_std": 0.1556222140789032,
"rewards/accuracy_reward": 0.6959635615348816,
"rewards/brier_reward": 0.7862125039100647,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9915364384651184,
"rewards/mean_confidence_reward": 0.6431315541267395,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01236979166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3963.0,
"completions/mean_length": 753.7682495117188,
"completions/mean_terminated_length": 711.90771484375,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.2688,
"grad_norm": 0.00031608319841325283,
"learning_rate": 2.903225806451613e-06,
"loss": 0.0118,
"num_tokens": 58983569.0,
"reward": 1.2014753818511963,
"reward_std": 0.15760651230812073,
"rewards/accuracy_reward": 0.6510416865348816,
"rewards/brier_reward": 0.7662191390991211,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9856770634651184,
"rewards/mean_confidence_reward": 0.6426758170127869,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01236979166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3037.0,
"completions/mean_length": 762.0319213867188,
"completions/mean_terminated_length": 720.2748413085938,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.2752,
"grad_norm": 0.0003945046046283096,
"learning_rate": 2.822580645161291e-06,
"loss": 0.0117,
"num_tokens": 60567154.0,
"reward": 1.1954646110534668,
"reward_std": 0.17542928457260132,
"rewards/accuracy_reward": 0.6373698115348816,
"rewards/brier_reward": 0.7659163475036621,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9876301884651184,
"rewards/mean_confidence_reward": 0.6377930045127869,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3965.0,
"completions/mean_length": 721.765625,
"completions/mean_terminated_length": 692.9639282226562,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.2816,
"grad_norm": 0.00038879967178218067,
"learning_rate": 2.7419354838709676e-06,
"loss": 0.0063,
"num_tokens": 62088242.0,
"reward": 1.22244393825531,
"reward_std": 0.1557161509990692,
"rewards/accuracy_reward": 0.6764323115348816,
"rewards/brier_reward": 0.7775569558143616,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.6463867425918579,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3411.0,
"completions/mean_length": 750.142578125,
"completions/mean_terminated_length": 717.14599609375,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.288,
"grad_norm": 0.000633600982837379,
"learning_rate": 2.6612903225806454e-06,
"loss": 0.0088,
"num_tokens": 63650541.0,
"reward": 1.19394052028656,
"reward_std": 0.15082021057605743,
"rewards/accuracy_reward": 0.6302083134651184,
"rewards/brier_reward": 0.7674251198768616,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.6472656726837158,
"step": 45
},
{
"epoch": 0.288,
"eval_completions/clipped_ratio": 0.0078125,
"eval_completions/max_length": 3459.625,
"eval_completions/max_terminated_length": 2331.375,
"eval_completions/mean_length": 745.1733016967773,
"eval_completions/mean_terminated_length": 718.9744033813477,
"eval_completions/min_length": 278.5,
"eval_completions/min_terminated_length": 278.5,
"eval_loss": 0.0,
"eval_num_tokens": 63650541.0,
"eval_reward": 1.21938157081604,
"eval_reward_std": 0.31588873267173767,
"eval_rewards/accuracy_reward": 0.6689453125,
"eval_rewards/brier_reward": 0.7785937488079071,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9912109375,
"eval_rewards/mean_confidence_reward": 0.6538085639476776,
"eval_runtime": 232.4576,
"eval_samples_per_second": 4.302,
"eval_steps_per_second": 0.034,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3526.0,
"completions/mean_length": 693.5755615234375,
"completions/mean_terminated_length": 680.2327270507812,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.2944,
"grad_norm": 0.0003804862208198756,
"learning_rate": 2.580645161290323e-06,
"loss": 0.0061,
"num_tokens": 65133321.0,
"reward": 1.2144540548324585,
"reward_std": 0.1589406430721283,
"rewards/accuracy_reward": 0.658203125,
"rewards/brier_reward": 0.7752490043640137,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.6563476920127869,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01822916666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3405.0,
"completions/mean_length": 785.98046875,
"completions/mean_terminated_length": 724.5211791992188,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.3008,
"grad_norm": 0.00043931242544203997,
"learning_rate": 2.5e-06,
"loss": 0.0167,
"num_tokens": 66754283.0,
"reward": 1.2090681791305542,
"reward_std": 0.1811358779668808,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/brier_reward": 0.770336925983429,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9811198115348816,
"rewards/mean_confidence_reward": 0.64208984375,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3563.0,
"completions/mean_length": 806.318359375,
"completions/mean_terminated_length": 760.7188110351562,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.3072,
"grad_norm": 0.0003429017961025238,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.0111,
"num_tokens": 68411652.0,
"reward": 1.1699422597885132,
"reward_std": 0.18925714492797852,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.7526318430900574,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9856770634651184,
"rewards/mean_confidence_reward": 0.6504232287406921,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3093.0,
"completions/mean_length": 725.8483276367188,
"completions/mean_terminated_length": 705.9849243164062,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.3136,
"grad_norm": 0.0005808745627291501,
"learning_rate": 2.338709677419355e-06,
"loss": 0.0078,
"num_tokens": 69942339.0,
"reward": 1.2273976802825928,
"reward_std": 0.1579420566558838,
"rewards/accuracy_reward": 0.6809895634651184,
"rewards/brier_reward": 0.7796516418457031,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.6576171517372131,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3871.0,
"completions/mean_length": 707.404296875,
"completions/mean_terminated_length": 687.4322509765625,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.32,
"grad_norm": 0.0003494632837828249,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.0075,
"num_tokens": 71440024.0,
"reward": 1.2161612510681152,
"reward_std": 0.15466436743736267,
"rewards/accuracy_reward": 0.6614583134651184,
"rewards/brier_reward": 0.7773616909980774,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.64990234375,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4033.0,
"completions/mean_length": 710.1224365234375,
"completions/mean_terminated_length": 681.2213134765625,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.3264,
"grad_norm": 0.0006095463177189231,
"learning_rate": 2.17741935483871e-06,
"loss": 0.0083,
"num_tokens": 72939964.0,
"reward": 1.2039477825164795,
"reward_std": 0.1723545491695404,
"rewards/accuracy_reward": 0.6458333134651184,
"rewards/brier_reward": 0.7711637616157532,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.6484701037406921,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3985.0,
"completions/mean_length": 695.3828125,
"completions/mean_terminated_length": 670.853759765625,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.3328,
"grad_norm": 0.0003877031267620623,
"learning_rate": 2.096774193548387e-06,
"loss": 0.0083,
"num_tokens": 74424472.0,
"reward": 1.2081983089447021,
"reward_std": 0.1538151502609253,
"rewards/accuracy_reward": 0.6536458134651184,
"rewards/brier_reward": 0.7712011337280273,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9915364384651184,
"rewards/mean_confidence_reward": 0.650390625,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3727.0,
"completions/mean_length": 706.1061401367188,
"completions/mean_terminated_length": 692.8124389648438,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.3392,
"grad_norm": 0.00037506147054955363,
"learning_rate": 2.0161290322580646e-06,
"loss": 0.0051,
"num_tokens": 75917531.0,
"reward": 1.2101529836654663,
"reward_std": 0.15496405959129333,
"rewards/accuracy_reward": 0.6510416865348816,
"rewards/brier_reward": 0.7731575965881348,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6464192867279053,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2506.0,
"completions/mean_length": 675.6868896484375,
"completions/mean_terminated_length": 662.2738647460938,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.3456,
"grad_norm": 0.0005340283387340605,
"learning_rate": 1.935483870967742e-06,
"loss": 0.0052,
"num_tokens": 77366122.0,
"reward": 1.2335808277130127,
"reward_std": 0.12662772834300995,
"rewards/accuracy_reward": 0.6881510615348816,
"rewards/brier_reward": 0.7829036712646484,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6430338025093079,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3214.0,
"completions/mean_length": 704.0651245117188,
"completions/mean_terminated_length": 692.987548828125,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.352,
"grad_norm": 0.0003404791059438139,
"learning_rate": 1.8548387096774196e-06,
"loss": 0.0043,
"num_tokens": 78861758.0,
"reward": 1.2172143459320068,
"reward_std": 0.11808693408966064,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.777514636516571,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9967448115348816,
"rewards/mean_confidence_reward": 0.6418294310569763,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3948.0,
"completions/mean_length": 744.0364990234375,
"completions/mean_terminated_length": 724.2802734375,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.3584,
"grad_norm": 0.0007370049715973437,
"learning_rate": 1.774193548387097e-06,
"loss": 0.0055,
"num_tokens": 80419182.0,
"reward": 1.2309666872024536,
"reward_std": 0.16057714819908142,
"rewards/accuracy_reward": 0.6842448115348816,
"rewards/brier_reward": 0.7841861844062805,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6373047232627869,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2338.0,
"completions/mean_length": 670.259765625,
"completions/mean_terminated_length": 647.8106079101562,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.3648,
"grad_norm": 0.00045788983698002994,
"learning_rate": 1.6935483870967742e-06,
"loss": 0.0076,
"num_tokens": 81861365.0,
"reward": 1.2649039030075073,
"reward_std": 0.1364418864250183,
"rewards/accuracy_reward": 0.740234375,
"rewards/brier_reward": 0.7960709929466248,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6371744871139526,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013020833333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3940.0,
"completions/mean_length": 693.5612182617188,
"completions/mean_terminated_length": 689.1251220703125,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.3712,
"grad_norm": 0.0005753524601459503,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.0031,
"num_tokens": 83342299.0,
"reward": 1.251284122467041,
"reward_std": 0.14745217561721802,
"rewards/accuracy_reward": 0.7135416865348816,
"rewards/brier_reward": 0.7903158068656921,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9986979365348816,
"rewards/mean_confidence_reward": 0.6346353888511658,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00455729166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3458.0,
"completions/mean_length": 674.2213745117188,
"completions/mean_terminated_length": 658.555908203125,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.3776,
"grad_norm": 0.0003959361056331545,
"learning_rate": 1.5322580645161292e-06,
"loss": 0.0041,
"num_tokens": 84788295.0,
"reward": 1.2028402090072632,
"reward_std": 0.14612236618995667,
"rewards/accuracy_reward": 0.6399739384651184,
"rewards/brier_reward": 0.7702505588531494,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.6440039277076721,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3363.0,
"completions/mean_length": 687.2760620117188,
"completions/mean_terminated_length": 658.179931640625,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.384,
"grad_norm": 0.00045462019625119865,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.007,
"num_tokens": 86257199.0,
"reward": 1.214907169342041,
"reward_std": 0.15338455140590668,
"rewards/accuracy_reward": 0.6653645634651184,
"rewards/brier_reward": 0.7735514640808105,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.6363281607627869,
"step": 60
},
{
"epoch": 0.384,
"eval_completions/clipped_ratio": 0.0029296875,
"eval_completions/max_length": 2977.75,
"eval_completions/max_terminated_length": 2288.25,
"eval_completions/mean_length": 679.1435546875,
"eval_completions/mean_terminated_length": 669.0825271606445,
"eval_completions/min_length": 242.75,
"eval_completions/min_terminated_length": 242.75,
"eval_loss": 0.0,
"eval_num_tokens": 86257199.0,
"eval_reward": 1.2179948389530182,
"eval_reward_std": 0.30138808116316795,
"eval_rewards/accuracy_reward": 0.662109375,
"eval_rewards/brier_reward": 0.7767968773841858,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9970703125,
"eval_rewards/mean_confidence_reward": 0.6503905951976776,
"eval_runtime": 200.9214,
"eval_samples_per_second": 4.977,
"eval_steps_per_second": 0.04,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01106770833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4076.0,
"completions/mean_length": 725.111328125,
"completions/mean_terminated_length": 687.3857421875,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.3904,
"grad_norm": 0.0005666792858392,
"learning_rate": 1.3709677419354838e-06,
"loss": 0.0108,
"num_tokens": 87793234.0,
"reward": 1.1888753175735474,
"reward_std": 0.1578804850578308,
"rewards/accuracy_reward": 0.626953125,
"rewards/brier_reward": 0.7625033259391785,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.6428385376930237,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4079.0,
"completions/mean_length": 704.2174682617188,
"completions/mean_terminated_length": 693.140380859375,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.3968,
"grad_norm": 0.0003873308305628598,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.0032,
"num_tokens": 89288952.0,
"reward": 1.2213973999023438,
"reward_std": 0.15189020335674286,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.779370129108429,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.6493815183639526,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3277.0,
"completions/mean_length": 681.048828125,
"completions/mean_terminated_length": 658.67041015625,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.4032,
"grad_norm": 0.0003833891823887825,
"learning_rate": 1.2096774193548388e-06,
"loss": 0.0069,
"num_tokens": 90747043.0,
"reward": 1.2279908657073975,
"reward_std": 0.13211821019649506,
"rewards/accuracy_reward": 0.6790364384651184,
"rewards/brier_reward": 0.7834423184394836,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6490560173988342,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3750.0,
"completions/mean_length": 703.8782958984375,
"completions/mean_terminated_length": 683.8853759765625,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.4096,
"grad_norm": 0.00041995308129116893,
"learning_rate": 1.1290322580645162e-06,
"loss": 0.0057,
"num_tokens": 92239016.0,
"reward": 1.2233237028121948,
"reward_std": 0.15987294912338257,
"rewards/accuracy_reward": 0.6751301884651184,
"rewards/brier_reward": 0.7780142426490784,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6548828482627869,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3156.0,
"completions/mean_length": 684.9453125,
"completions/mean_terminated_length": 671.5686645507812,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.416,
"grad_norm": 0.0003942767216358334,
"learning_rate": 1.0483870967741936e-06,
"loss": 0.0035,
"num_tokens": 93708572.0,
"reward": 1.244889497756958,
"reward_std": 0.13460424542427063,
"rewards/accuracy_reward": 0.7037760615348816,
"rewards/brier_reward": 0.7905468940734863,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.6529297232627869,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00455729166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2739.0,
"completions/mean_length": 692.6686401367188,
"completions/mean_terminated_length": 677.0875854492188,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.4224,
"grad_norm": 0.0004131811438128352,
"learning_rate": 9.67741935483871e-07,
"loss": 0.0056,
"num_tokens": 95184503.0,
"reward": 1.2223446369171143,
"reward_std": 0.14547114074230194,
"rewards/accuracy_reward": 0.673828125,
"rewards/brier_reward": 0.77540522813797,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.6509440541267395,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2844.0,
"completions/mean_length": 680.1510620117188,
"completions/mean_terminated_length": 660.018310546875,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.4288,
"grad_norm": 0.0007328905630856752,
"learning_rate": 8.870967741935485e-07,
"loss": 0.006,
"num_tokens": 96639215.0,
"reward": 1.2431633472442627,
"reward_std": 0.15597683191299438,
"rewards/accuracy_reward": 0.6998698115348816,
"rewards/brier_reward": 0.7923030853271484,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.6509439945220947,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3798.0,
"completions/mean_length": 676.5755615234375,
"completions/mean_terminated_length": 665.408203125,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.4352,
"grad_norm": 0.00044186998275108635,
"learning_rate": 8.064516129032258e-07,
"loss": 0.0052,
"num_tokens": 98093275.0,
"reward": 1.2231853008270264,
"reward_std": 0.1525024026632309,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.7816438674926758,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9967448115348816,
"rewards/mean_confidence_reward": 0.6505208611488342,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3791.0,
"completions/mean_length": 694.0651245117188,
"completions/mean_terminated_length": 671.77197265625,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.4416,
"grad_norm": 0.00043534423457458615,
"learning_rate": 7.258064516129033e-07,
"loss": 0.0063,
"num_tokens": 99573463.0,
"reward": 1.2070882320404053,
"reward_std": 0.1573706716299057,
"rewards/accuracy_reward": 0.6490885615348816,
"rewards/brier_reward": 0.7722363471984863,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9928385615348816,
"rewards/mean_confidence_reward": 0.6500000357627869,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3028.0,
"completions/mean_length": 685.0462646484375,
"completions/mean_terminated_length": 673.9065551757812,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.448,
"grad_norm": 0.0004712227964773774,
"learning_rate": 6.451612903225807e-07,
"loss": 0.0032,
"num_tokens": 101040030.0,
"reward": 1.2174032926559448,
"reward_std": 0.15992116928100586,
"rewards/accuracy_reward": 0.6627604365348816,
"rewards/brier_reward": 0.7752881050109863,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9967448115348816,
"rewards/mean_confidence_reward": 0.65478515625,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3558.0,
"completions/mean_length": 680.4296875,
"completions/mean_terminated_length": 669.2749633789062,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.4544,
"grad_norm": 0.000348559144185856,
"learning_rate": 5.645161290322581e-07,
"loss": 0.0062,
"num_tokens": 102493714.0,
"reward": 1.252179503440857,
"reward_std": 0.13250502943992615,
"rewards/accuracy_reward": 0.71484375,
"rewards/brier_reward": 0.7927572131156921,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9967448115348816,
"rewards/mean_confidence_reward": 0.6566406488418579,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00455729166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3703.0,
"completions/mean_length": 669.994140625,
"completions/mean_terminated_length": 654.309326171875,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.4608,
"grad_norm": 0.0006249594734981656,
"learning_rate": 4.838709677419355e-07,
"loss": 0.0041,
"num_tokens": 103937737.0,
"reward": 1.206282615661621,
"reward_std": 0.16145284473896027,
"rewards/accuracy_reward": 0.6451823115348816,
"rewards/brier_reward": 0.7719270586967468,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.653124988079071,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00520833333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3851.0,
"completions/mean_length": 710.1790771484375,
"completions/mean_terminated_length": 692.4522094726562,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.4672,
"grad_norm": 0.00035240311990492046,
"learning_rate": 4.032258064516129e-07,
"loss": 0.0045,
"num_tokens": 105434012.0,
"reward": 1.2136597633361816,
"reward_std": 0.14880546927452087,
"rewards/accuracy_reward": 0.6575520634651184,
"rewards/brier_reward": 0.7762646079063416,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.654980480670929,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00520833333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 665.9733276367188,
"completions/mean_terminated_length": 648.0150756835938,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.4736,
"grad_norm": 0.000533724669367075,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.0053,
"num_tokens": 106866827.0,
"reward": 1.2393426895141602,
"reward_std": 0.12406208366155624,
"rewards/accuracy_reward": 0.6953125,
"rewards/brier_reward": 0.7885677218437195,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9947916865348816,
"rewards/mean_confidence_reward": 0.6604166626930237,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3536.0,
"completions/mean_length": 709.642578125,
"completions/mean_terminated_length": 687.4515380859375,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.48,
"grad_norm": 0.00038802894414402544,
"learning_rate": 2.4193548387096775e-07,
"loss": 0.0096,
"num_tokens": 108365654.0,
"reward": 1.2166390419006348,
"reward_std": 0.15992063283920288,
"rewards/accuracy_reward": 0.662109375,
"rewards/brier_reward": 0.7776660919189453,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.6550130248069763,
"step": 75
},
{
"epoch": 0.48,
"eval_completions/clipped_ratio": 0.0029296875,
"eval_completions/max_length": 3102.875,
"eval_completions/max_terminated_length": 2502.375,
"eval_completions/mean_length": 680.3981399536133,
"eval_completions/mean_terminated_length": 670.3711547851562,
"eval_completions/min_length": 259.875,
"eval_completions/min_terminated_length": 259.875,
"eval_loss": 0.0,
"eval_num_tokens": 108365654.0,
"eval_reward": 1.210053026676178,
"eval_reward_std": 0.3010319583117962,
"eval_rewards/accuracy_reward": 0.646484375,
"eval_rewards/brier_reward": 0.7765380889177322,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9970703125,
"eval_rewards/mean_confidence_reward": 0.6592773273587227,
"eval_runtime": 208.3194,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 0.038,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013020833333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2417.0,
"completions/mean_length": 677.7591552734375,
"completions/mean_terminated_length": 673.302490234375,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.4864,
"grad_norm": 0.00038000475615262985,
"learning_rate": 1.6129032258064518e-07,
"loss": 0.0003,
"num_tokens": 109815244.0,
"reward": 1.2704503536224365,
"reward_std": 0.13931426405906677,
"rewards/accuracy_reward": 0.736328125,
"rewards/brier_reward": 0.8058609962463379,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9986979365348816,
"rewards/mean_confidence_reward": 0.6624674201011658,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3288.0,
"completions/mean_length": 673.58984375,
"completions/mean_terminated_length": 660.1686401367188,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.4928,
"grad_norm": 0.0003717007639352232,
"learning_rate": 8.064516129032259e-08,
"loss": 0.0043,
"num_tokens": 111259550.0,
"reward": 1.2620697021484375,
"reward_std": 0.1205185204744339,
"rewards/accuracy_reward": 0.7272135615348816,
"rewards/brier_reward": 0.800818681716919,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.656933605670929,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013020833333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 2643.0,
"completions/mean_length": 694.8952026367188,
"completions/mean_terminated_length": 690.4608764648438,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.4992,
"grad_norm": 0.0004337712307460606,
"learning_rate": 0.0,
"loss": 0.0007,
"num_tokens": 112741685.0,
"reward": 1.223556399345398,
"reward_std": 0.14543090760707855,
"rewards/accuracy_reward": 0.6705729365348816,
"rewards/brier_reward": 0.778479814529419,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998046875,
"rewards/mean_confidence_reward": 0.650390625,
"step": 78
},
{
"epoch": 0.4992,
"step": 78,
"total_flos": 0.0,
"train_loss": 0.009755346474822802,
"train_runtime": 16596.0705,
"train_samples_per_second": 0.904,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 78,
"num_input_tokens_seen": 112741685,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}