{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4992, "eval_steps": 15, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 587.9622802734375, "completions/mean_terminated_length": 518.0810546875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.00229196366854012, "learning_rate": 3.125e-07, "loss": 0.0123, "num_tokens": 1316862.0, "reward": 0.5813632607460022, "reward_std": 0.5275046825408936, "rewards/accuracy_reward": 0.2630208432674408, "rewards/brier_reward": 0.3104891777038574, "rewards/confidence_one_or_zero": 0.3606770932674408, "rewards/format_reward": 0.5891926884651184, "rewards/mean_confidence_reward": 0.826708972454071, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 599.0475463867188, "completions/mean_terminated_length": 545.8883056640625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0128, "grad_norm": 0.002298542531207204, "learning_rate": 6.25e-07, "loss": 0.0196, "num_tokens": 2652743.0, "reward": 0.6089980006217957, "reward_std": 0.5383763313293457, "rewards/accuracy_reward": 0.2760416567325592, "rewards/brier_reward": 0.32409217953681946, "rewards/confidence_one_or_zero": 0.3411458432674408, "rewards/format_reward": 0.6178385615348816, "rewards/mean_confidence_reward": 0.8342577815055847, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 601.3822021484375, "completions/mean_terminated_length": 550.6017456054688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0192, "grad_norm": 0.0022709358017891645, "learning_rate": 9.375000000000001e-07, "loss": 0.0136, "num_tokens": 3996250.0, "reward": 0.5906873941421509, "reward_std": 0.5221948623657227, "rewards/accuracy_reward": 0.2545572817325592, "rewards/brier_reward": 0.3135129511356354, "rewards/confidence_one_or_zero": 0.337890625, "rewards/format_reward": 0.61328125, "rewards/mean_confidence_reward": 0.8254140019416809, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 565.1041870117188, "completions/mean_terminated_length": 494.7676086425781, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0256, "grad_norm": 0.0018688800046220422, "learning_rate": 1.25e-06, "loss": 0.0203, "num_tokens": 5265922.0, "reward": 0.613066554069519, "reward_std": 0.5181887745857239, "rewards/accuracy_reward": 0.2669270932674408, "rewards/brier_reward": 0.32051023840904236, "rewards/confidence_one_or_zero": 0.330078125, "rewards/format_reward": 0.638671875, "rewards/mean_confidence_reward": 0.859596312046051, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 580.2845458984375, "completions/mean_terminated_length": 515.0059814453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.032, "grad_norm": 0.002923916792497039, "learning_rate": 1.5625e-06, "loss": 0.0191, "num_tokens": 6570791.0, "reward": 0.5732605457305908, "reward_std": 0.5266666412353516, "rewards/accuracy_reward": 0.23828125, "rewards/brier_reward": 0.29949304461479187, "rewards/confidence_one_or_zero": 0.3203125, "rewards/format_reward": 0.6087239384651184, "rewards/mean_confidence_reward": 0.8248130679130554, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 581.8854370117188, "completions/mean_terminated_length": 507.11700439453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0384, "grad_norm": 0.0017212529201060534, "learning_rate": 1.8750000000000003e-06, "loss": 0.0107, "num_tokens": 7879951.0, "reward": 0.6507998704910278, "reward_std": 0.4878300130367279, "rewards/accuracy_reward": 0.2682291567325592, "rewards/brier_reward": 0.3386862277984619, "rewards/confidence_one_or_zero": 0.3072916567325592, "rewards/format_reward": 0.6946614384651184, "rewards/mean_confidence_reward": 0.8365259766578674, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02213541666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 578.3861083984375, "completions/mean_terminated_length": 498.7596435546875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0448, "grad_norm": 0.010415066964924335, "learning_rate": 2.1875000000000002e-06, "loss": 0.0281, "num_tokens": 9185616.0, "reward": 0.6887897253036499, "reward_std": 0.4675048291683197, "rewards/accuracy_reward": 0.279296875, "rewards/brier_reward": 0.34956130385398865, "rewards/confidence_one_or_zero": 0.3326822817325592, "rewards/format_reward": 0.7486979365348816, "rewards/mean_confidence_reward": 0.8381659984588623, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 520.171875, "completions/mean_terminated_length": 463.4126892089844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0512, "grad_norm": 0.002210509032011032, "learning_rate": 2.5e-06, "loss": 0.0182, "num_tokens": 10394440.0, "reward": 0.7613718509674072, "reward_std": 0.44713592529296875, "rewards/accuracy_reward": 0.2975260317325592, "rewards/brier_reward": 0.38209569454193115, "rewards/confidence_one_or_zero": 0.2805989682674408, "rewards/format_reward": 0.8430989384651184, "rewards/mean_confidence_reward": 0.8710243105888367, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3597.0, "completions/mean_length": 493.451171875, "completions/mean_terminated_length": 445.9241638183594, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.0576, "grad_norm": 0.001208790927194059, "learning_rate": 2.8125e-06, "loss": 0.0073, "num_tokens": 11570397.0, "reward": 0.8752974271774292, "reward_std": 0.4269093871116638, "rewards/accuracy_reward": 0.3815104067325592, "rewards/brier_reward": 0.4621589183807373, "rewards/confidence_one_or_zero": 0.3411458432674408, "rewards/format_reward": 0.9069010615348816, "rewards/mean_confidence_reward": 0.8819381594657898, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3662.0, "completions/mean_length": 441.7220153808594, "completions/mean_terminated_length": 415.36328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.064, "grad_norm": 0.0010935115860775113, "learning_rate": 3.125e-06, "loss": 0.0015, "num_tokens": 12670578.0, "reward": 0.8601632118225098, "reward_std": 0.39231395721435547, "rewards/accuracy_reward": 0.3326822817325592, "rewards/brier_reward": 0.4344954490661621, "rewards/confidence_one_or_zero": 0.2962239682674408, "rewards/format_reward": 0.953125, "rewards/mean_confidence_reward": 0.8905722498893738, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3880.0, "completions/mean_length": 444.0716247558594, "completions/mean_terminated_length": 424.9515686035156, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0704, "grad_norm": 0.000973426504060626, "learning_rate": 3.4375e-06, "loss": 0.0011, "num_tokens": 13766424.0, "reward": 0.9216434359550476, "reward_std": 0.3478318154811859, "rewards/accuracy_reward": 0.3717447817325592, "rewards/brier_reward": 0.4884470999240875, "rewards/confidence_one_or_zero": 0.220703125, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.8892440795898438, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 468.76953125, "completions/mean_terminated_length": 435.40472412109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.0768, "grad_norm": 0.0008072754135355353, "learning_rate": 3.7500000000000005e-06, "loss": 0.0078, "num_tokens": 14894670.0, "reward": 0.9307572245597839, "reward_std": 0.32634100317955017, "rewards/accuracy_reward": 0.3763020932674408, "rewards/brier_reward": 0.5001646876335144, "rewards/confidence_one_or_zero": 0.1946614533662796, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.8821842074394226, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3957.0, "completions/mean_length": 458.2786560058594, "completions/mean_terminated_length": 427.22784423828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0832, "grad_norm": 0.0009474229882471263, "learning_rate": 4.0625000000000005e-06, "loss": 0.0075, "num_tokens": 16008858.0, "reward": 1.0004254579544067, "reward_std": 0.3447754979133606, "rewards/accuracy_reward": 0.4557291567325592, "rewards/brier_reward": 0.5620275139808655, "rewards/confidence_one_or_zero": 0.1796875, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.878938615322113, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 470.76434326171875, "completions/mean_terminated_length": 437.4178466796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.0896, "grad_norm": 0.0010997421341016889, "learning_rate": 4.3750000000000005e-06, "loss": 0.0066, "num_tokens": 17144288.0, "reward": 1.0370821952819824, "reward_std": 0.3125256896018982, "rewards/accuracy_reward": 0.4830729067325592, "rewards/brier_reward": 0.6021392941474915, "rewards/confidence_one_or_zero": 0.1302083283662796, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.8621549606323242, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 492.4739685058594, "completions/mean_terminated_length": 461.7150573730469, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.096, "grad_norm": 0.00073769356822595, "learning_rate": 4.6875000000000004e-06, "loss": 0.0056, "num_tokens": 18320432.0, "reward": 1.0514283180236816, "reward_std": 0.2952514588832855, "rewards/accuracy_reward": 0.49609375, "rewards/brier_reward": 0.6191139817237854, "rewards/confidence_one_or_zero": 0.083984375, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.8452711701393127, "step": 15 }, { "epoch": 0.096, "eval_completions/clipped_ratio": 0.01412259615384616, "eval_completions/max_length": 4017.25, "eval_completions/max_terminated_length": 2676.0, "eval_completions/mean_length": 537.4269104003906, "eval_completions/mean_terminated_length": 486.3598976135254, "eval_completions/min_length": 137.875, "eval_completions/min_terminated_length": 137.875, "eval_loss": 0.0, "eval_num_tokens": 18320432.0, "eval_reward": 1.057287573814392, "eval_reward_std": 0.43193066120147705, "eval_rewards/accuracy_reward": 0.4990234375, "eval_rewards/brier_reward": 0.6321354508399963, "eval_rewards/confidence_one_or_zero": 0.0654296875, "eval_rewards/format_reward": 0.9833984375, "eval_rewards/mean_confidence_reward": 0.8258691281080246, "eval_runtime": 266.5804, "eval_samples_per_second": 3.751, "eval_steps_per_second": 0.03, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 499.75653076171875, "completions/mean_terminated_length": 471.43963623046875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1024, "grad_norm": 0.000677171687129885, "learning_rate": 5e-06, "loss": 0.0082, "num_tokens": 19507706.0, "reward": 1.1004698276519775, "reward_std": 0.28478488326072693, "rewards/accuracy_reward": 0.5462239384651184, "rewards/brier_reward": 0.6664164066314697, "rewards/confidence_one_or_zero": 0.0631510391831398, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.8320556282997131, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 554.9486083984375, "completions/mean_terminated_length": 510.5978698730469, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1088, "grad_norm": 0.0006332960911095142, "learning_rate": 4.919354838709678e-06, "loss": 0.0081, "num_tokens": 20777075.0, "reward": 1.048607349395752, "reward_std": 0.2772809863090515, "rewards/accuracy_reward": 0.4791666567325592, "rewards/brier_reward": 0.6343072056770325, "rewards/confidence_one_or_zero": 0.0338541679084301, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.8093522191047668, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3753.0, "completions/mean_length": 577.5267333984375, "completions/mean_terminated_length": 540.4901733398438, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1152, "grad_norm": 0.0005637197173200548, "learning_rate": 4.838709677419355e-06, "loss": 0.0167, "num_tokens": 22072396.0, "reward": 1.126558780670166, "reward_std": 0.26943597197532654, "rewards/accuracy_reward": 0.5677083134651184, "rewards/brier_reward": 0.6971117854118347, "rewards/confidence_one_or_zero": 0.014973958022892475, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.7954492568969727, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3794.0, "completions/mean_length": 599.7174682617188, "completions/mean_terminated_length": 551.254150390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1216, "grad_norm": 0.0004333103133831173, "learning_rate": 4.758064516129033e-06, "loss": 0.0161, "num_tokens": 23407450.0, "reward": 1.1329493522644043, "reward_std": 0.2181943953037262, "rewards/accuracy_reward": 0.5735676884651184, "rewards/brier_reward": 0.7066380381584167, "rewards/confidence_one_or_zero": 0.009114583022892475, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.7850651741027832, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 630.1165771484375, "completions/mean_terminated_length": 589.0191040039062, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.128, "grad_norm": 0.0005656772991642356, "learning_rate": 4.67741935483871e-06, "loss": 0.0139, "num_tokens": 24786269.0, "reward": 1.1472342014312744, "reward_std": 0.21497444808483124, "rewards/accuracy_reward": 0.5859375, "rewards/brier_reward": 0.7221875190734863, "rewards/confidence_one_or_zero": 0.0013020833721384406, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.7606358528137207, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 713.6373901367188, "completions/mean_terminated_length": 662.2200927734375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1344, "grad_norm": 0.0004828801902476698, "learning_rate": 4.596774193548387e-06, "loss": 0.0197, "num_tokens": 26296008.0, "reward": 1.1559076309204102, "reward_std": 0.24854782223701477, "rewards/accuracy_reward": 0.6009114384651184, "rewards/brier_reward": 0.7297688126564026, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.7453190684318542, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3628.0, "completions/mean_length": 667.09375, "completions/mean_terminated_length": 614.9689331054688, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1408, "grad_norm": 0.035328447818756104, "learning_rate": 4.516129032258065e-06, "loss": 0.0153, "num_tokens": 27729288.0, "reward": 1.1439964771270752, "reward_std": 0.21810662746429443, "rewards/accuracy_reward": 0.5904948115348816, "rewards/brier_reward": 0.7150607705116272, "rewards/confidence_one_or_zero": 0.0045572915114462376, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.7736979126930237, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 759.8724365234375, "completions/mean_terminated_length": 693.4157104492188, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1472, "grad_norm": 0.00043078220915049314, "learning_rate": 4.435483870967742e-06, "loss": 0.0252, "num_tokens": 29316252.0, "reward": 1.1476705074310303, "reward_std": 0.24656717479228973, "rewards/accuracy_reward": 0.5885416865348816, "rewards/brier_reward": 0.7308734059333801, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9759114384651184, "rewards/mean_confidence_reward": 0.7125194668769836, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 693.0065307617188, "completions/mean_terminated_length": 663.9592895507812, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1536, "grad_norm": 0.0003661557857412845, "learning_rate": 4.35483870967742e-06, "loss": 0.0141, "num_tokens": 30791358.0, "reward": 1.1998891830444336, "reward_std": 0.19935479760169983, "rewards/accuracy_reward": 0.6471354365348816, "rewards/brier_reward": 0.7636963725090027, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.7080404162406921, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 710.5794677734375, "completions/mean_terminated_length": 652.2874145507812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.16, "grad_norm": 0.0003608058614190668, "learning_rate": 4.274193548387097e-06, "loss": 0.0155, "num_tokens": 32289752.0, "reward": 1.1741443872451782, "reward_std": 0.20154307782649994, "rewards/accuracy_reward": 0.6197916865348816, "rewards/brier_reward": 0.747363269329071, "rewards/confidence_one_or_zero": 0.0013020833721384406, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.6910156607627869, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 720.763671875, "completions/mean_terminated_length": 671.718017578125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1664, "grad_norm": 0.00038824151852168143, "learning_rate": 4.193548387096774e-06, "loss": 0.0144, "num_tokens": 33811813.0, "reward": 1.1451948881149292, "reward_std": 0.1946631371974945, "rewards/accuracy_reward": 0.5755208134651184, "rewards/brier_reward": 0.7324332594871521, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.6781575679779053, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 702.4681396484375, "completions/mean_terminated_length": 650.8810424804688, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1728, "grad_norm": 0.0003422526060603559, "learning_rate": 4.112903225806452e-06, "loss": 0.0196, "num_tokens": 35296276.0, "reward": 1.2147198915481567, "reward_std": 0.1851186752319336, "rewards/accuracy_reward": 0.6751301884651184, "rewards/brier_reward": 0.7725251317024231, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.6732291579246521, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 758.5983276367188, "completions/mean_terminated_length": 714.5692749023438, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1792, "grad_norm": 0.0003135671140626073, "learning_rate": 4.032258064516129e-06, "loss": 0.018, "num_tokens": 36873307.0, "reward": 1.2059857845306396, "reward_std": 0.183271586894989, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.7674267888069153, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.6653971672058105, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 756.6393432617188, "completions/mean_terminated_length": 725.9224243164062, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1856, "grad_norm": 0.00033678373438306153, "learning_rate": 3.951612903225807e-06, "loss": 0.0113, "num_tokens": 38440297.0, "reward": 1.204465627670288, "reward_std": 0.16857890784740448, "rewards/accuracy_reward": 0.654296875, "rewards/brier_reward": 0.7656884789466858, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.6629883050918579, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3894.0, "completions/mean_length": 784.548828125, "completions/mean_terminated_length": 738.6475219726562, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.192, "grad_norm": 0.00036860810359939933, "learning_rate": 3.870967741935484e-06, "loss": 0.0127, "num_tokens": 40054092.0, "reward": 1.1792367696762085, "reward_std": 0.18021389842033386, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.7542936205863953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.6539713740348816, "step": 30 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.020207331730769232, "eval_completions/max_length": 4084.875, "eval_completions/max_terminated_length": 2403.25, "eval_completions/mean_length": 779.5079650878906, "eval_completions/mean_terminated_length": 710.9605026245117, "eval_completions/min_length": 284.625, "eval_completions/min_terminated_length": 284.625, "eval_loss": 0.0, "eval_num_tokens": 40054092.0, "eval_reward": 1.178975060582161, "eval_reward_std": 0.3463897779583931, "eval_rewards/accuracy_reward": 0.625, "eval_rewards/brier_reward": 0.7544213905930519, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.978515625, "eval_rewards/mean_confidence_reward": 0.6520995870232582, "eval_runtime": 278.501, "eval_samples_per_second": 3.591, "eval_steps_per_second": 0.029, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 741.806640625, "completions/mean_terminated_length": 706.4993896484375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1984, "grad_norm": 0.0005164485774002969, "learning_rate": 3.7903225806451614e-06, "loss": 0.0135, "num_tokens": 41608667.0, "reward": 1.2111461162567139, "reward_std": 0.1950198858976364, "rewards/accuracy_reward": 0.66796875, "rewards/brier_reward": 0.767981767654419, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.6626952886581421, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 800.8229370117188, "completions/mean_terminated_length": 748.5184936523438, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2048, "grad_norm": 0.0003207188274245709, "learning_rate": 3.7096774193548392e-06, "loss": 0.0187, "num_tokens": 43245683.0, "reward": 1.1906991004943848, "reward_std": 0.18316665291786194, "rewards/accuracy_reward": 0.6399739384651184, "rewards/brier_reward": 0.7602912783622742, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.6509439945220947, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 734.4603271484375, "completions/mean_terminated_length": 694.6001586914062, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2112, "grad_norm": 0.0003439242427702993, "learning_rate": 3.6290322580645166e-06, "loss": 0.014, "num_tokens": 44783254.0, "reward": 1.1869686841964722, "reward_std": 0.18534022569656372, "rewards/accuracy_reward": 0.6321614384651184, "rewards/brier_reward": 0.7567365765571594, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.6561523079872131, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 749.0651245117188, "completions/mean_terminated_length": 729.3385620117188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2176, "grad_norm": 0.0005052844644524157, "learning_rate": 3.548387096774194e-06, "loss": 0.005, "num_tokens": 46343578.0, "reward": 1.1979600191116333, "reward_std": 0.16698740422725677, "rewards/accuracy_reward": 0.63671875, "rewards/brier_reward": 0.765698254108429, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6605142951011658, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 719.84375, "completions/mean_terminated_length": 711.0287475585938, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.224, "grad_norm": 0.0003193453885614872, "learning_rate": 3.4677419354838714e-06, "loss": 0.0049, "num_tokens": 47861770.0, "reward": 1.2130991220474243, "reward_std": 0.15396082401275635, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.7725391387939453, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6565755009651184, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 757.34375, "completions/mean_terminated_length": 728.845703125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2304, "grad_norm": 0.00032273278338834643, "learning_rate": 3.3870967741935484e-06, "loss": 0.0081, "num_tokens": 49439394.0, "reward": 1.208127498626709, "reward_std": 0.16208727657794952, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.7684553265571594, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6528971195220947, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3426.0, "completions/mean_length": 773.3659057617188, "completions/mean_terminated_length": 727.3095703125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.2368, "grad_norm": 0.00034387107007205486, "learning_rate": 3.306451612903226e-06, "loss": 0.0111, "num_tokens": 51037868.0, "reward": 1.201973557472229, "reward_std": 0.17515644431114197, "rewards/accuracy_reward": 0.6555989384651184, "rewards/brier_reward": 0.764611005783081, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.6488932371139526, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 754.919921875, "completions/mean_terminated_length": 713.0737915039062, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2432, "grad_norm": 0.00033718085614964366, "learning_rate": 3.225806451612903e-06, "loss": 0.0094, "num_tokens": 52609993.0, "reward": 1.2070956230163574, "reward_std": 0.1585657298564911, "rewards/accuracy_reward": 0.6588541865348816, "rewards/brier_reward": 0.7709488868713379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.643261730670929, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 787.2845458984375, "completions/mean_terminated_length": 750.2547607421875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2496, "grad_norm": 0.00032993199420161545, "learning_rate": 3.145161290322581e-06, "loss": 0.0113, "num_tokens": 54228542.0, "reward": 1.1909804344177246, "reward_std": 0.17440544068813324, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.759552001953125, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.643287718296051, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 774.5560302734375, "completions/mean_terminated_length": 750.5980224609375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.256, "grad_norm": 0.00032635065144859254, "learning_rate": 3.0645161290322584e-06, "loss": 0.0109, "num_tokens": 55832548.0, "reward": 1.203762173652649, "reward_std": 0.17923477292060852, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.7694905400276184, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.6456055045127869, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 762.052734375, "completions/mean_terminated_length": 735.8012084960938, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.2624, "grad_norm": 0.0003047576465178281, "learning_rate": 2.983870967741936e-06, "loss": 0.008, "num_tokens": 57414589.0, "reward": 1.2368627786636353, "reward_std": 0.1556222140789032, "rewards/accuracy_reward": 0.6959635615348816, "rewards/brier_reward": 0.7862125039100647, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.6431315541267395, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 753.7682495117188, "completions/mean_terminated_length": 711.90771484375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.2688, "grad_norm": 0.00031608319841325283, "learning_rate": 2.903225806451613e-06, "loss": 0.0118, "num_tokens": 58983569.0, "reward": 1.2014753818511963, "reward_std": 0.15760651230812073, "rewards/accuracy_reward": 0.6510416865348816, "rewards/brier_reward": 0.7662191390991211, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.6426758170127869, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 762.0319213867188, "completions/mean_terminated_length": 720.2748413085938, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2752, "grad_norm": 0.0003945046046283096, "learning_rate": 2.822580645161291e-06, "loss": 0.0117, "num_tokens": 60567154.0, "reward": 1.1954646110534668, "reward_std": 0.17542928457260132, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.7659163475036621, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.6377930045127869, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3965.0, "completions/mean_length": 721.765625, "completions/mean_terminated_length": 692.9639282226562, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2816, "grad_norm": 0.00038879967178218067, "learning_rate": 2.7419354838709676e-06, "loss": 0.0063, "num_tokens": 62088242.0, "reward": 1.22244393825531, "reward_std": 0.1557161509990692, "rewards/accuracy_reward": 0.6764323115348816, "rewards/brier_reward": 0.7775569558143616, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.6463867425918579, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 750.142578125, "completions/mean_terminated_length": 717.14599609375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.288, "grad_norm": 0.000633600982837379, "learning_rate": 2.6612903225806454e-06, "loss": 0.0088, "num_tokens": 63650541.0, "reward": 1.19394052028656, "reward_std": 0.15082021057605743, "rewards/accuracy_reward": 0.6302083134651184, "rewards/brier_reward": 0.7674251198768616, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6472656726837158, "step": 45 }, { "epoch": 0.288, "eval_completions/clipped_ratio": 0.0078125, "eval_completions/max_length": 3459.625, "eval_completions/max_terminated_length": 2331.375, "eval_completions/mean_length": 745.1733016967773, "eval_completions/mean_terminated_length": 718.9744033813477, "eval_completions/min_length": 278.5, "eval_completions/min_terminated_length": 278.5, "eval_loss": 0.0, "eval_num_tokens": 63650541.0, "eval_reward": 1.21938157081604, "eval_reward_std": 0.31588873267173767, "eval_rewards/accuracy_reward": 0.6689453125, "eval_rewards/brier_reward": 0.7785937488079071, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9912109375, "eval_rewards/mean_confidence_reward": 0.6538085639476776, "eval_runtime": 232.4576, "eval_samples_per_second": 4.302, "eval_steps_per_second": 0.034, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 693.5755615234375, "completions/mean_terminated_length": 680.2327270507812, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2944, "grad_norm": 0.0003804862208198756, "learning_rate": 2.580645161290323e-06, "loss": 0.0061, "num_tokens": 65133321.0, "reward": 1.2144540548324585, "reward_std": 0.1589406430721283, "rewards/accuracy_reward": 0.658203125, "rewards/brier_reward": 0.7752490043640137, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6563476920127869, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 785.98046875, "completions/mean_terminated_length": 724.5211791992188, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3008, "grad_norm": 0.00043931242544203997, "learning_rate": 2.5e-06, "loss": 0.0167, "num_tokens": 66754283.0, "reward": 1.2090681791305542, "reward_std": 0.1811358779668808, "rewards/accuracy_reward": 0.6666666865348816, "rewards/brier_reward": 0.770336925983429, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.64208984375, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 806.318359375, "completions/mean_terminated_length": 760.7188110351562, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3072, "grad_norm": 0.0003429017961025238, "learning_rate": 2.4193548387096776e-06, "loss": 0.0111, "num_tokens": 68411652.0, "reward": 1.1699422597885132, "reward_std": 0.18925714492797852, "rewards/accuracy_reward": 0.6015625, "rewards/brier_reward": 0.7526318430900574, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.6504232287406921, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3093.0, "completions/mean_length": 725.8483276367188, "completions/mean_terminated_length": 705.9849243164062, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3136, "grad_norm": 0.0005808745627291501, "learning_rate": 2.338709677419355e-06, "loss": 0.0078, "num_tokens": 69942339.0, "reward": 1.2273976802825928, "reward_std": 0.1579420566558838, "rewards/accuracy_reward": 0.6809895634651184, "rewards/brier_reward": 0.7796516418457031, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6576171517372131, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3871.0, "completions/mean_length": 707.404296875, "completions/mean_terminated_length": 687.4322509765625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.32, "grad_norm": 0.0003494632837828249, "learning_rate": 2.2580645161290324e-06, "loss": 0.0075, "num_tokens": 71440024.0, "reward": 1.2161612510681152, "reward_std": 0.15466436743736267, "rewards/accuracy_reward": 0.6614583134651184, "rewards/brier_reward": 0.7773616909980774, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.64990234375, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 710.1224365234375, "completions/mean_terminated_length": 681.2213134765625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3264, "grad_norm": 0.0006095463177189231, "learning_rate": 2.17741935483871e-06, "loss": 0.0083, "num_tokens": 72939964.0, "reward": 1.2039477825164795, "reward_std": 0.1723545491695404, "rewards/accuracy_reward": 0.6458333134651184, "rewards/brier_reward": 0.7711637616157532, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.6484701037406921, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 695.3828125, "completions/mean_terminated_length": 670.853759765625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3328, "grad_norm": 0.0003877031267620623, "learning_rate": 2.096774193548387e-06, "loss": 0.0083, "num_tokens": 74424472.0, "reward": 1.2081983089447021, "reward_std": 0.1538151502609253, "rewards/accuracy_reward": 0.6536458134651184, "rewards/brier_reward": 0.7712011337280273, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.650390625, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 706.1061401367188, "completions/mean_terminated_length": 692.8124389648438, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3392, "grad_norm": 0.00037506147054955363, "learning_rate": 2.0161290322580646e-06, "loss": 0.0051, "num_tokens": 75917531.0, "reward": 1.2101529836654663, "reward_std": 0.15496405959129333, "rewards/accuracy_reward": 0.6510416865348816, "rewards/brier_reward": 0.7731575965881348, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6464192867279053, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 675.6868896484375, "completions/mean_terminated_length": 662.2738647460938, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.3456, "grad_norm": 0.0005340283387340605, "learning_rate": 1.935483870967742e-06, "loss": 0.0052, "num_tokens": 77366122.0, "reward": 1.2335808277130127, "reward_std": 0.12662772834300995, "rewards/accuracy_reward": 0.6881510615348816, "rewards/brier_reward": 0.7829036712646484, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6430338025093079, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3214.0, "completions/mean_length": 704.0651245117188, "completions/mean_terminated_length": 692.987548828125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.352, "grad_norm": 0.0003404791059438139, "learning_rate": 1.8548387096774196e-06, "loss": 0.0043, "num_tokens": 78861758.0, "reward": 1.2172143459320068, "reward_std": 0.11808693408966064, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.777514636516571, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6418294310569763, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3948.0, "completions/mean_length": 744.0364990234375, "completions/mean_terminated_length": 724.2802734375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3584, "grad_norm": 0.0007370049715973437, "learning_rate": 1.774193548387097e-06, "loss": 0.0055, "num_tokens": 80419182.0, "reward": 1.2309666872024536, "reward_std": 0.16057714819908142, "rewards/accuracy_reward": 0.6842448115348816, "rewards/brier_reward": 0.7841861844062805, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6373047232627869, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 670.259765625, "completions/mean_terminated_length": 647.8106079101562, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3648, "grad_norm": 0.00045788983698002994, "learning_rate": 1.6935483870967742e-06, "loss": 0.0076, "num_tokens": 81861365.0, "reward": 1.2649039030075073, "reward_std": 0.1364418864250183, "rewards/accuracy_reward": 0.740234375, "rewards/brier_reward": 0.7960709929466248, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6371744871139526, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3940.0, "completions/mean_length": 693.5612182617188, "completions/mean_terminated_length": 689.1251220703125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3712, "grad_norm": 0.0005753524601459503, "learning_rate": 1.6129032258064516e-06, "loss": 0.0031, "num_tokens": 83342299.0, "reward": 1.251284122467041, "reward_std": 0.14745217561721802, "rewards/accuracy_reward": 0.7135416865348816, "rewards/brier_reward": 0.7903158068656921, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6346353888511658, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 674.2213745117188, "completions/mean_terminated_length": 658.555908203125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.3776, "grad_norm": 0.0003959361056331545, "learning_rate": 1.5322580645161292e-06, "loss": 0.0041, "num_tokens": 84788295.0, "reward": 1.2028402090072632, "reward_std": 0.14612236618995667, "rewards/accuracy_reward": 0.6399739384651184, "rewards/brier_reward": 0.7702505588531494, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6440039277076721, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 687.2760620117188, "completions/mean_terminated_length": 658.179931640625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.384, "grad_norm": 0.00045462019625119865, "learning_rate": 1.4516129032258066e-06, "loss": 0.007, "num_tokens": 86257199.0, "reward": 1.214907169342041, "reward_std": 0.15338455140590668, "rewards/accuracy_reward": 0.6653645634651184, "rewards/brier_reward": 0.7735514640808105, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.6363281607627869, "step": 60 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 2977.75, "eval_completions/max_terminated_length": 2288.25, "eval_completions/mean_length": 679.1435546875, "eval_completions/mean_terminated_length": 669.0825271606445, "eval_completions/min_length": 242.75, "eval_completions/min_terminated_length": 242.75, "eval_loss": 0.0, "eval_num_tokens": 86257199.0, "eval_reward": 1.2179948389530182, "eval_reward_std": 0.30138808116316795, "eval_rewards/accuracy_reward": 0.662109375, "eval_rewards/brier_reward": 0.7767968773841858, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9970703125, "eval_rewards/mean_confidence_reward": 0.6503905951976776, "eval_runtime": 200.9214, "eval_samples_per_second": 4.977, "eval_steps_per_second": 0.04, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 725.111328125, "completions/mean_terminated_length": 687.3857421875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3904, "grad_norm": 0.0005666792858392, "learning_rate": 1.3709677419354838e-06, "loss": 0.0108, "num_tokens": 87793234.0, "reward": 1.1888753175735474, "reward_std": 0.1578804850578308, "rewards/accuracy_reward": 0.626953125, "rewards/brier_reward": 0.7625033259391785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.6428385376930237, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 704.2174682617188, "completions/mean_terminated_length": 693.140380859375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3968, "grad_norm": 0.0003873308305628598, "learning_rate": 1.2903225806451614e-06, "loss": 0.0032, "num_tokens": 89288952.0, "reward": 1.2213973999023438, "reward_std": 0.15189020335674286, "rewards/accuracy_reward": 0.66796875, "rewards/brier_reward": 0.779370129108429, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6493815183639526, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3277.0, "completions/mean_length": 681.048828125, "completions/mean_terminated_length": 658.67041015625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.4032, "grad_norm": 0.0003833891823887825, "learning_rate": 1.2096774193548388e-06, "loss": 0.0069, "num_tokens": 90747043.0, "reward": 1.2279908657073975, "reward_std": 0.13211821019649506, "rewards/accuracy_reward": 0.6790364384651184, "rewards/brier_reward": 0.7834423184394836, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6490560173988342, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3750.0, "completions/mean_length": 703.8782958984375, "completions/mean_terminated_length": 683.8853759765625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4096, "grad_norm": 0.00041995308129116893, "learning_rate": 1.1290322580645162e-06, "loss": 0.0057, "num_tokens": 92239016.0, "reward": 1.2233237028121948, "reward_std": 0.15987294912338257, "rewards/accuracy_reward": 0.6751301884651184, "rewards/brier_reward": 0.7780142426490784, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6548828482627869, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3156.0, "completions/mean_length": 684.9453125, "completions/mean_terminated_length": 671.5686645507812, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.416, "grad_norm": 0.0003942767216358334, "learning_rate": 1.0483870967741936e-06, "loss": 0.0035, "num_tokens": 93708572.0, "reward": 1.244889497756958, "reward_std": 0.13460424542427063, "rewards/accuracy_reward": 0.7037760615348816, "rewards/brier_reward": 0.7905468940734863, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6529297232627869, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 692.6686401367188, "completions/mean_terminated_length": 677.0875854492188, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4224, "grad_norm": 0.0004131811438128352, "learning_rate": 9.67741935483871e-07, "loss": 0.0056, "num_tokens": 95184503.0, "reward": 1.2223446369171143, "reward_std": 0.14547114074230194, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.77540522813797, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6509440541267395, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 680.1510620117188, "completions/mean_terminated_length": 660.018310546875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4288, "grad_norm": 0.0007328905630856752, "learning_rate": 8.870967741935485e-07, "loss": 0.006, "num_tokens": 96639215.0, "reward": 1.2431633472442627, "reward_std": 0.15597683191299438, "rewards/accuracy_reward": 0.6998698115348816, "rewards/brier_reward": 0.7923030853271484, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6509439945220947, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3798.0, "completions/mean_length": 676.5755615234375, "completions/mean_terminated_length": 665.408203125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4352, "grad_norm": 0.00044186998275108635, "learning_rate": 8.064516129032258e-07, "loss": 0.0052, "num_tokens": 98093275.0, "reward": 1.2231853008270264, "reward_std": 0.1525024026632309, "rewards/accuracy_reward": 0.66796875, "rewards/brier_reward": 0.7816438674926758, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6505208611488342, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3791.0, "completions/mean_length": 694.0651245117188, "completions/mean_terminated_length": 671.77197265625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.4416, "grad_norm": 0.00043534423457458615, "learning_rate": 7.258064516129033e-07, "loss": 0.0063, "num_tokens": 99573463.0, "reward": 1.2070882320404053, "reward_std": 0.1573706716299057, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.7722363471984863, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.6500000357627869, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 685.0462646484375, "completions/mean_terminated_length": 673.9065551757812, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.448, "grad_norm": 0.0004712227964773774, "learning_rate": 6.451612903225807e-07, "loss": 0.0032, "num_tokens": 101040030.0, "reward": 1.2174032926559448, "reward_std": 0.15992116928100586, "rewards/accuracy_reward": 0.6627604365348816, "rewards/brier_reward": 0.7752881050109863, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.65478515625, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 680.4296875, "completions/mean_terminated_length": 669.2749633789062, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.4544, "grad_norm": 0.000348559144185856, "learning_rate": 5.645161290322581e-07, "loss": 0.0062, "num_tokens": 102493714.0, "reward": 1.252179503440857, "reward_std": 0.13250502943992615, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.7927572131156921, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6566406488418579, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 669.994140625, "completions/mean_terminated_length": 654.309326171875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4608, "grad_norm": 0.0006249594734981656, "learning_rate": 4.838709677419355e-07, "loss": 0.0041, "num_tokens": 103937737.0, "reward": 1.206282615661621, "reward_std": 0.16145284473896027, "rewards/accuracy_reward": 0.6451823115348816, "rewards/brier_reward": 0.7719270586967468, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.653124988079071, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3851.0, "completions/mean_length": 710.1790771484375, "completions/mean_terminated_length": 692.4522094726562, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4672, "grad_norm": 0.00035240311990492046, "learning_rate": 4.032258064516129e-07, "loss": 0.0045, "num_tokens": 105434012.0, "reward": 1.2136597633361816, "reward_std": 0.14880546927452087, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.7762646079063416, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.654980480670929, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 665.9733276367188, "completions/mean_terminated_length": 648.0150756835938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4736, "grad_norm": 0.000533724669367075, "learning_rate": 3.2258064516129035e-07, "loss": 0.0053, "num_tokens": 106866827.0, "reward": 1.2393426895141602, "reward_std": 0.12406208366155624, "rewards/accuracy_reward": 0.6953125, "rewards/brier_reward": 0.7885677218437195, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.6604166626930237, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 709.642578125, "completions/mean_terminated_length": 687.4515380859375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.48, "grad_norm": 0.00038802894414402544, "learning_rate": 2.4193548387096775e-07, "loss": 0.0096, "num_tokens": 108365654.0, "reward": 1.2166390419006348, "reward_std": 0.15992063283920288, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.7776660919189453, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.6550130248069763, "step": 75 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 3102.875, "eval_completions/max_terminated_length": 2502.375, "eval_completions/mean_length": 680.3981399536133, "eval_completions/mean_terminated_length": 670.3711547851562, "eval_completions/min_length": 259.875, "eval_completions/min_terminated_length": 259.875, "eval_loss": 0.0, "eval_num_tokens": 108365654.0, "eval_reward": 1.210053026676178, "eval_reward_std": 0.3010319583117962, "eval_rewards/accuracy_reward": 0.646484375, "eval_rewards/brier_reward": 0.7765380889177322, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9970703125, "eval_rewards/mean_confidence_reward": 0.6592773273587227, "eval_runtime": 208.3194, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.038, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 677.7591552734375, "completions/mean_terminated_length": 673.302490234375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4864, "grad_norm": 0.00038000475615262985, "learning_rate": 1.6129032258064518e-07, "loss": 0.0003, "num_tokens": 109815244.0, "reward": 1.2704503536224365, "reward_std": 0.13931426405906677, "rewards/accuracy_reward": 0.736328125, "rewards/brier_reward": 0.8058609962463379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6624674201011658, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 673.58984375, "completions/mean_terminated_length": 660.1686401367188, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4928, "grad_norm": 0.0003717007639352232, "learning_rate": 8.064516129032259e-08, "loss": 0.0043, "num_tokens": 111259550.0, "reward": 1.2620697021484375, "reward_std": 0.1205185204744339, "rewards/accuracy_reward": 0.7272135615348816, "rewards/brier_reward": 0.800818681716919, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.656933605670929, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 694.8952026367188, "completions/mean_terminated_length": 690.4608764648438, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4992, "grad_norm": 0.0004337712307460606, "learning_rate": 0.0, "loss": 0.0007, "num_tokens": 112741685.0, "reward": 1.223556399345398, "reward_std": 0.14543090760707855, "rewards/accuracy_reward": 0.6705729365348816, "rewards/brier_reward": 0.778479814529419, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.650390625, "step": 78 }, { "epoch": 0.4992, "step": 78, "total_flos": 0.0, "train_loss": 0.009755346474822802, "train_runtime": 16596.0705, "train_samples_per_second": 0.904, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 78, "num_input_tokens_seen": 112741685, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }