{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7557821869850159, "adv/mean_abs_reasoning": 0.28040462732315063, "adv/mean_abs_step_conf": 0.6320238709449768, "adv/ratio_final_to_reasoning": 2.69532708571895, "adv/ratio_step_to_reasoning": 2.2539709026149723, "adv/std_final_conf": 0.9257818460464478, "adv/std_reasoning": 0.5727222561836243, "adv/std_step_conf": 0.8462716937065125, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.59765625, "calib/ece": 0.23243902439024394, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.008130081300813009, "calib/gap": -0.04614489795918364, "calib/mean_conf": 0.6646341463414636, "calib/mu_c": 0.6552551020408164, "calib/mu_w": 0.7014, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05016260162601624, "calib/std_conf": 0.05917169015101882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583372, "calib/step_q_c_n": 2500.0, "calib/step_q_gap": -0.0778082748585287, "calib/step_q_w": 0.6611802748585287, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 750.2265625, "completions/mean_terminated_length": 780.7235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0010666666666666667, "grad_norm": 0.3166561424732208, "kl": 0.00047022104263305664, "learning_rate": 0.0, "loss": -0.1462, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01929234340786934, "mask/share_reasoning": 0.7498296499252319, "mask/share_step_conf": 0.19181546568870544, "num_tokens": 299642.0, "reward": 0.5737828612327576, "reward_std": 0.07307003438472748, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7708241939544678, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.0314289852976799, "step": 1 }, { "adv/mean_abs_final_conf": 0.7929245233535767, "adv/mean_abs_reasoning": 0.4050842523574829, "adv/mean_abs_step_conf": 0.6475293636322021, "adv/ratio_final_to_reasoning": 1.9574311238685933, "adv/ratio_step_to_reasoning": 1.598505396997669, "adv/std_final_conf": 0.9301473498344421, "adv/std_reasoning": 0.6612725853919983, "adv/std_step_conf": 0.8531926274299622, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.078125, "calib/ece": 0.04704724409448811, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.008169981916817282, "calib/mean_conf": 0.6691732283464566, "calib/mu_c": 0.6717142857142857, "calib/mu_w": 0.6635443037974684, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013622047244094477, "calib/std_conf": 0.060200661111313364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5911686697057605, "calib/step_q_c_n": 2413.0, "calib/step_q_gap": -0.011375410898773475, "calib/step_q_w": 0.602544080604534, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 867.8828125, "completions/mean_terminated_length": 871.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.0021333333333333334, "grad_norm": 0.5694341063499451, "kl": 0.0006206929683685303, "learning_rate": 2.5000000000000004e-07, "loss": -0.0783, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01878987066447735, "mask/share_reasoning": 0.7856365442276001, "mask/share_step_conf": 0.1916673481464386, "num_tokens": 625108.0, "reward": 0.5773488879203796, "reward_std": 0.08374661952257156, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7790628671646118, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.04047856479883194, "step": 2 }, { "adv/mean_abs_final_conf": 0.7365277409553528, "adv/mean_abs_reasoning": 0.3439093828201294, "adv/mean_abs_step_conf": 0.610215425491333, "adv/ratio_final_to_reasoning": 2.1416331677713187, "adv/ratio_step_to_reasoning": 1.7743494535898903, "adv/std_final_conf": 0.928483247756958, "adv/std_reasoning": 0.6401665806770325, "adv/std_step_conf": 0.8361523747444153, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.54296875, "calib/ece": 0.14905882352941177, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.026731818181818134, "calib/mean_conf": 0.6652156862745098, "calib/mu_c": 0.65945, "calib/mu_w": 0.6861818181818181, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014980392156862742, "calib/std_conf": 0.05215223331953581, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583673786407767, "calib/step_q_c_n": 2575.0, "calib/step_q_gap": -0.02829930776263656, "calib/step_q_w": 0.6119730941704036, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 798.23828125, "completions/mean_terminated_length": 804.5236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0032, "grad_norm": 0.2995379865169525, "kl": 0.0009461045265197754, "learning_rate": 5.000000000000001e-07, "loss": -0.0562, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019037947058677673, "mask/share_reasoning": 0.7804015278816223, "mask/share_step_conf": 0.1927480697631836, "num_tokens": 934713.0, "reward": 0.5961868762969971, "reward_std": 0.07548847794532776, "rewards/accuracy_reward_step": 0.78125, "rewards/final_brier_reward_step": 0.8017418384552002, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.03516314923763275, "step": 3 }, { "adv/mean_abs_final_conf": 0.7282766699790955, "adv/mean_abs_reasoning": 0.32462894916534424, "adv/mean_abs_step_conf": 0.6020452976226807, "adv/ratio_final_to_reasoning": 2.243412584895995, "adv/ratio_step_to_reasoning": 1.8545644162992967, "adv/std_final_conf": 0.9282972812652588, "adv/std_reasoning": 0.6184795498847961, "adv/std_step_conf": 0.8494290709495544, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.33203125, "calib/ece": 0.11106299212598428, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.030018018018017845, "calib/mean_conf": 0.6750787401574803, "calib/mu_c": 0.6663333333333333, "calib/mu_w": 0.6963513513513512, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03874015748031496, "calib/std_conf": 0.0618783549220696, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5895343035343035, "calib/step_q_c_n": 2405.0, "calib/step_q_gap": -0.021051139503671212, "calib/step_q_w": 0.6105854430379747, "calib/step_q_w_n": 1264.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 871.1015625, "completions/mean_terminated_length": 881.4308471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.004266666666666667, "grad_norm": 0.8032536506652832, "kl": 0.0005264580249786377, "learning_rate": 7.5e-07, "loss": -0.0951, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.017951160669326782, "mask/share_reasoning": 0.7815943956375122, "mask/share_step_conf": 0.1887357234954834, "num_tokens": 1263883.0, "reward": 0.5748437643051147, "reward_std": 0.07029317319393158, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7701238393783569, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.04050109535455704, "step": 4 }, { "adv/mean_abs_final_conf": 0.7468620538711548, "adv/mean_abs_reasoning": 0.28012946248054504, "adv/mean_abs_step_conf": 0.45508497953414917, "adv/ratio_final_to_reasoning": 2.6661317494336187, "adv/ratio_step_to_reasoning": 1.6245523605563437, "adv/std_final_conf": 0.930158793926239, "adv/std_reasoning": 0.5726768970489502, "adv/std_step_conf": 0.7269405722618103, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.49609375, "calib/ece": 0.17788000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.012, "calib/gap": -0.04787837094111158, "calib/mean_conf": 0.67148, "calib/mu_c": 0.653860759493671, "calib/mu_w": 0.7017391304347825, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10867999999999997, "calib/std_conf": 0.0702894700506413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5855629139072848, "calib/step_q_c_n": 1963.0, "calib/step_q_gap": -0.04010642247715468, "calib/step_q_w": 0.6256693363844394, "calib/step_q_w_n": 1748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 847.38671875, "completions/mean_terminated_length": 860.8373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.005333333333333333, "grad_norm": 0.24338386952877045, "kl": 0.0006460249423980713, "learning_rate": 1.0000000000000002e-06, "loss": -0.0796, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0191742405295372, "mask/share_reasoning": 0.7722139358520508, "mask/share_step_conf": 0.1929868459701538, "num_tokens": 1587502.0, "reward": 0.530252993106842, "reward_std": 0.06734541058540344, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7213417887687683, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.020414207130670547, "step": 5 }, { "adv/mean_abs_final_conf": 0.745267927646637, "adv/mean_abs_reasoning": 0.2637953758239746, "adv/mean_abs_step_conf": 0.6003158688545227, "adv/ratio_final_to_reasoning": 2.825174343252853, "adv/ratio_step_to_reasoning": 2.2756876119583747, "adv/std_final_conf": 0.9142794013023376, "adv/std_reasoning": 0.5482735633850098, "adv/std_step_conf": 0.8366710543632507, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.81640625, "calib/ece": 0.08827450980392164, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.0011739983188567837, "calib/mean_conf": 0.6561960784313725, "calib/mu_c": 0.6558139534883721, "calib/mu_w": 0.6569879518072289, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.034980392156862744, "calib/std_conf": 0.05665202334716018, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5864367816091954, "calib/step_q_c_n": 2175.0, "calib/step_q_gap": -0.004973706636735797, "calib/step_q_w": 0.5914104882459312, "calib/step_q_w_n": 1106.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 753.89453125, "completions/mean_terminated_length": 759.8306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.0064, "grad_norm": 0.36567795276641846, "kl": 0.001254260540008545, "learning_rate": 1.25e-06, "loss": -0.0765, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02114243432879448, "mask/share_reasoning": 0.7748997211456299, "mask/share_step_conf": 0.19614538550376892, "num_tokens": 1886451.0, "reward": 0.5701749324798584, "reward_std": 0.05616578459739685, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.773360550403595, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.033395521342754364, "step": 6 }, { "adv/mean_abs_final_conf": 0.7575596570968628, "adv/mean_abs_reasoning": 0.36128178238868713, "adv/mean_abs_step_conf": 0.632476806640625, "adv/ratio_final_to_reasoning": 2.096866473831326, "adv/ratio_step_to_reasoning": 1.750646828796286, "adv/std_final_conf": 0.9292610287666321, "adv/std_reasoning": 0.6404456496238708, "adv/std_step_conf": 0.8506557941436768, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 15.05078125, "calib/ece": 0.08143442622950824, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01089819376026302, "calib/mean_conf": 0.6709426229508196, "calib/mu_c": 0.6678160919540229, "calib/mu_w": 0.6787142857142859, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.019631147540983645, "calib/std_conf": 0.04872485466343217, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5869490800171159, "calib/step_q_c_n": 2337.0, "calib/step_q_gap": -0.056236935814018674, "calib/step_q_w": 0.6431860158311345, "calib/step_q_w_n": 1516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 861.77734375, "completions/mean_terminated_length": 896.8088989257812, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.007466666666666667, "grad_norm": 0.2327389270067215, "kl": 0.0005173087120056152, "learning_rate": 1.5e-06, "loss": -0.1872, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01713840663433075, "mask/share_reasoning": 0.7627436518669128, "mask/share_step_conf": 0.1810554563999176, "num_tokens": 2214490.0, "reward": 0.5554234981536865, "reward_std": 0.0987781286239624, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7499246001243591, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.034359902143478394, "step": 7 }, { "adv/mean_abs_final_conf": 0.7442705631256104, "adv/mean_abs_reasoning": 0.3813665807247162, "adv/mean_abs_step_conf": 0.5766507387161255, "adv/ratio_final_to_reasoning": 1.9515883161845557, "adv/ratio_step_to_reasoning": 1.5120641604733904, "adv/std_final_conf": 0.9289296269416809, "adv/std_reasoning": 0.6613633036613464, "adv/std_step_conf": 0.8194501399993896, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.109375, "calib/ece": 0.12160642570281124, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.012048192771084338, "calib/gap": -0.03541341991341973, "calib/mean_conf": 0.672128514056225, "calib/mu_c": 0.6601818181818182, "calib/mu_w": 0.6955952380952379, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06554216867469877, "calib/std_conf": 0.06362452254971758, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5869551427276846, "calib/step_q_c_n": 2207.0, "calib/step_q_gap": -0.036259788036914986, "calib/step_q_w": 0.6232149307645996, "calib/step_q_w_n": 1661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 860.09765625, "completions/mean_terminated_length": 880.7400512695312, "completions/min_length": 0.0, "completions/min_terminated_length": 339.0, "epoch": 0.008533333333333334, "grad_norm": 0.2793032228946686, "kl": 0.0006050467491149902, "learning_rate": 1.75e-06, "loss": -0.1417, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018345724791288376, "mask/share_reasoning": 0.7716814875602722, "mask/share_step_conf": 0.18653526902198792, "num_tokens": 2541187.0, "reward": 0.5402415990829468, "reward_std": 0.09596795588731766, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7357991933822632, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.021246377378702164, "step": 8 }, { "adv/mean_abs_final_conf": 0.7212363481521606, "adv/mean_abs_reasoning": 0.2584291398525238, "adv/mean_abs_step_conf": 0.6063222885131836, "adv/ratio_final_to_reasoning": 2.790847613251912, "adv/ratio_step_to_reasoning": 2.3461839050317232, "adv/std_final_conf": 0.9257317185401917, "adv/std_reasoning": 0.572688639163971, "adv/std_step_conf": 0.8504953384399414, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.3828125, "calib/ece": 0.10083665338645421, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.027888446215139442, "calib/gap": 0.0267156366092538, "calib/mean_conf": 0.6725498007968128, "calib/mu_c": 0.6792553191489362, "calib/mu_w": 0.6525396825396824, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012191235059760962, "calib/std_conf": 0.07387855134317332, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6040762041696621, "calib/step_q_c_n": 2782.0, "calib/step_q_gap": 0.007298426391884272, "calib/step_q_w": 0.5967777777777779, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 860.6171875, "completions/mean_terminated_length": 870.8221435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.0096, "grad_norm": 1.0299285650253296, "kl": 0.0008474588394165039, "learning_rate": 2.0000000000000003e-06, "loss": -0.1154, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01888362690806389, "mask/share_reasoning": 0.7786691188812256, "mask/share_step_conf": 0.19072850048542023, "num_tokens": 2869041.0, "reward": 0.5908889770507812, "reward_std": 0.07018930464982986, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7949097156524658, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.043899379670619965, "step": 9 }, { "adv/mean_abs_final_conf": 0.7579138875007629, "adv/mean_abs_reasoning": 0.41535621881484985, "adv/mean_abs_step_conf": 0.6774593591690063, "adv/ratio_final_to_reasoning": 1.8247322494974185, "adv/ratio_step_to_reasoning": 1.6310321802861756, "adv/std_final_conf": 0.9309507608413696, "adv/std_reasoning": 0.7014256119728088, "adv/std_step_conf": 0.8841572403907776, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 15.11328125, "calib/ece": 0.11116, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.018304761904761713, "calib/mean_conf": 0.6741199999999999, "calib/mu_c": 0.6686285714285716, "calib/mu_w": 0.6869333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04264000000000001, "calib/std_conf": 0.06341155730621982, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5946551724137931, "calib/step_q_c_n": 2436.0, "calib/step_q_gap": -0.03770351565319929, "calib/step_q_w": 0.6323586880669924, "calib/step_q_w_n": 1433.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 896.28515625, "completions/mean_terminated_length": 910.511962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 366.0, "epoch": 0.010666666666666666, "grad_norm": 0.38899895548820496, "kl": 0.001188516616821289, "learning_rate": 2.25e-06, "loss": -0.1247, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018132302910089493, "mask/share_reasoning": 0.7785909175872803, "mask/share_step_conf": 0.18765179812908173, "num_tokens": 3205290.0, "reward": 0.564469575881958, "reward_std": 0.1037634089589119, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7593957185745239, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.03751220554113388, "step": 10 }, { "adv/mean_abs_final_conf": 0.7381426095962524, "adv/mean_abs_reasoning": 0.2654629647731781, "adv/mean_abs_step_conf": 0.5914384722709656, "adv/ratio_final_to_reasoning": 2.7805860234664004, "adv/ratio_step_to_reasoning": 2.227950979061481, "adv/std_final_conf": 0.9269818663597107, "adv/std_reasoning": 0.54827481508255, "adv/std_step_conf": 0.8175468444824219, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 15.15234375, "calib/ece": 0.17334645669291335, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.015748031496062992, "calib/gap": -0.05524509803921562, "calib/mean_conf": 0.6788582677165355, "calib/mu_c": 0.6605882352941177, "calib/mu_w": 0.7158333333333333, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09145669291338585, "calib/std_conf": 0.06792100835917231, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5851105216622458, "calib/step_q_c_n": 2262.0, "calib/step_q_gap": -0.0441040732666349, "calib/step_q_w": 0.6292145949288807, "calib/step_q_w_n": 1617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 910.31640625, "completions/mean_terminated_length": 913.8863525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.011733333333333333, "grad_norm": 1.270994782447815, "kl": 0.0012431144714355469, "learning_rate": 2.5e-06, "loss": -0.0053, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018155530095100403, "mask/share_reasoning": 0.7854121923446655, "mask/share_step_conf": 0.19252607226371765, "num_tokens": 3542811.0, "reward": 0.5488805770874023, "reward_std": 0.06458449363708496, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7436433434486389, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.02286791056394577, "step": 11 }, { "adv/mean_abs_final_conf": 0.7413728833198547, "adv/mean_abs_reasoning": 0.3635881543159485, "adv/mean_abs_step_conf": 0.6452409625053406, "adv/ratio_final_to_reasoning": 2.039045756907749, "adv/ratio_step_to_reasoning": 1.7746479219579945, "adv/std_final_conf": 0.9299033880233765, "adv/std_reasoning": 0.6614682674407959, "adv/std_step_conf": 0.8669188618659973, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.73828125, "calib/ece": 0.12053061224489801, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.012244897959183673, "calib/gap": -0.0223040752351098, "calib/mean_conf": 0.6707346938775509, "calib/mu_c": 0.6654545454545454, "calib/mu_w": 0.6877586206896552, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013999999999999999, "calib/std_conf": 0.06331224065489251, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5873842302878598, "calib/step_q_c_n": 2397.0, "calib/step_q_gap": -0.06773931622376816, "calib/step_q_w": 0.655123546511628, "calib/step_q_w_n": 1376.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 778.83984375, "completions/mean_terminated_length": 810.5, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.0128, "grad_norm": 0.2657943665981293, "kl": 0.002060413360595703, "learning_rate": 2.7500000000000004e-06, "loss": -0.1714, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019777968525886536, "mask/share_reasoning": 0.743057906627655, "mask/share_step_conf": 0.19810162484645844, "num_tokens": 3846370.0, "reward": 0.5683996081352234, "reward_std": 0.10034461319446564, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7643597722053528, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.03493950143456459, "step": 12 }, { "adv/mean_abs_final_conf": 0.7224041819572449, "adv/mean_abs_reasoning": 0.39203640818595886, "adv/mean_abs_step_conf": 0.6825897693634033, "adv/ratio_final_to_reasoning": 1.8426966650877463, "adv/ratio_step_to_reasoning": 1.741138718523365, "adv/std_final_conf": 0.9304549098014832, "adv/std_reasoning": 0.6816896200180054, "adv/std_step_conf": 0.883434534072876, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.984375, "calib/ece": 0.0858964143426295, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0032125970955758643, "calib/mean_conf": 0.666374501992032, "calib/mu_c": 0.6671808510638297, "calib/mu_w": 0.6639682539682539, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.001633466135458168, "calib/std_conf": 0.05525748482093671, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.588135593220339, "calib/step_q_c_n": 2478.0, "calib/step_q_gap": -0.02999507828601311, "calib/step_q_w": 0.6181306715063521, "calib/step_q_w_n": 1102.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 817.25, "completions/mean_terminated_length": 830.2222900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.013866666666666666, "grad_norm": 0.33146727085113525, "kl": 0.0026552677154541016, "learning_rate": 3e-06, "loss": -0.1326, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019377902150154114, "mask/share_reasoning": 0.7720924615859985, "mask/share_step_conf": 0.19290460646152496, "num_tokens": 4160178.0, "reward": 0.5818509459495544, "reward_std": 0.09568098932504654, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.787639856338501, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.033093370497226715, "step": 13 }, { "adv/mean_abs_final_conf": 0.7974610328674316, "adv/mean_abs_reasoning": 0.41913557052612305, "adv/mean_abs_step_conf": 0.6118457913398743, "adv/ratio_final_to_reasoning": 1.902632677695221, "adv/ratio_step_to_reasoning": 1.4597801627092883, "adv/std_final_conf": 0.9300772547721863, "adv/std_reasoning": 0.6614927053451538, "adv/std_step_conf": 0.837138295173645, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 16.3359375, "calib/ece": 0.1026859504132232, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.01652892561983471, "calib/gap": -0.014451490171211057, "calib/mean_conf": 0.6792975206611571, "calib/mu_c": 0.6747590361445784, "calib/mu_w": 0.6892105263157895, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.048016528925619875, "calib/std_conf": 0.06787143225368777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5958808618504435, "calib/step_q_c_n": 2367.0, "calib/step_q_gap": -0.06811930343881267, "calib/step_q_w": 0.6640001652892562, "calib/step_q_w_n": 1815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 876.8671875, "completions/mean_terminated_length": 927.5950317382812, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.014933333333333333, "grad_norm": 0.644323468208313, "kl": 0.0029511451721191406, "learning_rate": 3.2500000000000002e-06, "loss": -0.2248, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017249684780836105, "mask/share_reasoning": 0.7453184723854065, "mask/share_step_conf": 0.1827443391084671, "num_tokens": 4490056.0, "reward": 0.5408897995948792, "reward_std": 0.10242260992527008, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7313886880874634, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.03164093196392059, "step": 14 }, { "adv/mean_abs_final_conf": 0.7296807169914246, "adv/mean_abs_reasoning": 0.40887928009033203, "adv/mean_abs_step_conf": 0.6450937986373901, "adv/ratio_final_to_reasoning": 1.784587169176729, "adv/ratio_step_to_reasoning": 1.5777121269017893, "adv/std_final_conf": 0.9294079542160034, "adv/std_reasoning": 0.7013062834739685, "adv/std_step_conf": 0.8664496541023254, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.42578125, "calib/ece": 0.12795275590551183, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.03773325318907372, "calib/mean_conf": 0.67748031496063, "calib/mu_c": 0.6636645962732919, "calib/mu_w": 0.7013978494623656, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08578740157480316, "calib/std_conf": 0.06818188393949375, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5890122249388753, "calib/step_q_c_n": 2045.0, "calib/step_q_gap": -0.027777823604814067, "calib/step_q_w": 0.6167900485436894, "calib/step_q_w_n": 1648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 824.375, "completions/mean_terminated_length": 830.8661499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.016, "grad_norm": 0.43208855390548706, "kl": 0.005416393280029297, "learning_rate": 3.5e-06, "loss": -0.0519, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019296735525131226, "mask/share_reasoning": 0.7683975696563721, "mask/share_step_conf": 0.2044931948184967, "num_tokens": 4808976.0, "reward": 0.5477840304374695, "reward_std": 0.09553329646587372, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.738040566444397, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.03330870717763901, "step": 15 }, { "adv/mean_abs_final_conf": 0.7689300179481506, "adv/mean_abs_reasoning": 0.3367258608341217, "adv/mean_abs_step_conf": 0.5895552039146423, "adv/ratio_final_to_reasoning": 2.2835490450403566, "adv/ratio_step_to_reasoning": 1.7508462297912715, "adv/std_final_conf": 0.9287763237953186, "adv/std_reasoning": 0.6187803745269775, "adv/std_step_conf": 0.8190042972564697, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 16.66015625, "calib/ece": 0.11865306122448982, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02222402597402595, "calib/mean_conf": 0.6964489795918367, "calib/mu_c": 0.6894642857142858, "calib/mu_w": 0.7116883116883117, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06469387755102042, "calib/std_conf": 0.06712798048701368, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6045779348252847, "calib/step_q_c_n": 2547.0, "calib/step_q_gap": -0.03265402093722991, "calib/step_q_w": 0.6372319557625146, "calib/step_q_w_n": 1718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 1002.92578125, "completions/mean_terminated_length": 1043.695068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.017066666666666667, "grad_norm": 0.24544142186641693, "kl": 0.0057163238525390625, "learning_rate": 3.7500000000000005e-06, "loss": -0.1641, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.015103766694664955, "mask/share_reasoning": 0.7649943828582764, "mask/share_step_conf": 0.18083932995796204, "num_tokens": 5174573.0, "reward": 0.5389186143875122, "reward_std": 0.09756513684988022, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7371909618377686, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.017989974468946457, "step": 16 }, { "adv/mean_abs_final_conf": 0.7784232497215271, "adv/mean_abs_reasoning": 0.2690816819667816, "adv/mean_abs_step_conf": 0.5825159549713135, "adv/ratio_final_to_reasoning": 2.892888300800885, "adv/ratio_step_to_reasoning": 2.164829470046295, "adv/std_final_conf": 0.9275980591773987, "adv/std_reasoning": 0.5483062267303467, "adv/std_step_conf": 0.8175146579742432, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.57421875, "calib/ece": 0.1028286852589641, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0199203187250996, "calib/gap": 0.0046954270923209585, "calib/mean_conf": 0.6889641434262949, "calib/mu_c": 0.6901052631578948, "calib/mu_w": 0.6854098360655738, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01741035856573707, "calib/std_conf": 0.07718479603011531, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6035703592814371, "calib/step_q_c_n": 2672.0, "calib/step_q_gap": -0.022582331936693234, "calib/step_q_w": 0.6261526912181303, "calib/step_q_w_n": 1059.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 861.125, "completions/mean_terminated_length": 878.2789306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.018133333333333335, "grad_norm": 0.2567841708660126, "kl": 0.0070972442626953125, "learning_rate": 4.000000000000001e-06, "loss": -0.0918, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01926950365304947, "mask/share_reasoning": 0.7539726495742798, "mask/share_step_conf": 0.20722657442092896, "num_tokens": 5498549.0, "reward": 0.5851441621780396, "reward_std": 0.06767553091049194, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7914144396781921, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.034342579543590546, "step": 17 }, { "adv/mean_abs_final_conf": 0.766454815864563, "adv/mean_abs_reasoning": 0.32873159646987915, "adv/mean_abs_step_conf": 0.636091947555542, "adv/ratio_final_to_reasoning": 2.3315520141514336, "adv/ratio_step_to_reasoning": 1.934988770128233, "adv/std_final_conf": 0.9293180108070374, "adv/std_reasoning": 0.6187217831611633, "adv/std_step_conf": 0.8517022728919983, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 16.29296875, "calib/ece": 0.1363786008230452, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.00823045267489712, "calib/gap": -0.024285714285714355, "calib/mean_conf": 0.6750617283950617, "calib/mu_c": 0.6666666666666666, "calib/mu_w": 0.690952380952381, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07855967078189298, "calib/std_conf": 0.08319925198683596, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5914795678722404, "calib/step_q_c_n": 2129.0, "calib/step_q_gap": -0.03671529011012986, "calib/step_q_w": 0.6281948579823703, "calib/step_q_w_n": 2042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 871.078125, "completions/mean_terminated_length": 906.48779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.0192, "grad_norm": 0.546768844127655, "kl": 0.009520530700683594, "learning_rate": 4.25e-06, "loss": -0.1916, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01771453768014908, "mask/share_reasoning": 0.7553741931915283, "mask/share_step_conf": 0.18784880638122559, "num_tokens": 5832265.0, "reward": 0.5322065353393555, "reward_std": 0.0910949558019638, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7171124815940857, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.03323813155293465, "step": 18 }, { "adv/mean_abs_final_conf": 0.7309796810150146, "adv/mean_abs_reasoning": 0.2801279127597809, "adv/mean_abs_step_conf": 0.5914919376373291, "adv/ratio_final_to_reasoning": 2.6094496396789073, "adv/ratio_step_to_reasoning": 2.111506603572752, "adv/std_final_conf": 0.913943886756897, "adv/std_reasoning": 0.5725624561309814, "adv/std_step_conf": 0.8334521651268005, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.203125, "calib/ece": 0.13434782608695645, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02455696202531643, "calib/mean_conf": 0.6576679841897233, "calib/mu_c": 0.65, "calib/mu_w": 0.6745569620253165, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05213438735177864, "calib/std_conf": 0.04752050153246474, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5765921244530872, "calib/step_q_c_n": 2057.0, "calib/step_q_gap": -0.04079260721735878, "calib/step_q_w": 0.6173847316704459, "calib/step_q_w_n": 1323.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 736.97265625, "completions/mean_terminated_length": 748.670654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.020266666666666665, "grad_norm": 0.3068307042121887, "kl": 0.014101028442382812, "learning_rate": 4.5e-06, "loss": -0.061, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0203668475151062, "mask/share_reasoning": 0.762851357460022, "mask/share_step_conf": 0.2011568248271942, "num_tokens": 6125690.0, "reward": 0.561945915222168, "reward_std": 0.05958160012960434, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7624972462654114, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.02780073508620262, "step": 19 }, { "adv/mean_abs_final_conf": 0.7632920742034912, "adv/mean_abs_reasoning": 0.22523455321788788, "adv/mean_abs_step_conf": 0.6193887591362, "adv/ratio_final_to_reasoning": 3.3888764547822117, "adv/ratio_step_to_reasoning": 2.749972196925817, "adv/std_final_conf": 0.9254392981529236, "adv/std_reasoning": 0.4959951937198639, "adv/std_step_conf": 0.8340355753898621, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.68359375, "calib/ece": 0.1906890688259109, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.016194331983805668, "calib/gap": -0.012711111111111006, "calib/mean_conf": 0.6513368421052631, "calib/mu_c": 0.647888888888889, "calib/mu_w": 0.6606, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0566404858299595, "calib/std_conf": 0.10182151993567717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583936120244246, "calib/step_q_c_n": 2129.0, "calib/step_q_gap": -0.06751387975575396, "calib/step_q_w": 0.65145, "calib/step_q_w_n": 1630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 769.0859375, "completions/mean_terminated_length": 793.8951416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.021333333333333333, "grad_norm": 1.4331884384155273, "kl": 0.028350830078125, "learning_rate": 4.75e-06, "loss": -0.1062, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.021386535838246346, "mask/share_reasoning": 0.7441344857215881, "mask/share_step_conf": 0.20322901010513306, "num_tokens": 6427448.0, "reward": 0.5562363862991333, "reward_std": 0.06482435017824173, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7534843683242798, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.02539462223649025, "step": 20 }, { "adv/mean_abs_final_conf": 0.7670572996139526, "adv/mean_abs_reasoning": 0.3465600609779358, "adv/mean_abs_step_conf": 0.6293857097625732, "adv/ratio_final_to_reasoning": 2.2133459275412246, "adv/ratio_step_to_reasoning": 1.8160941800002854, "adv/std_final_conf": 0.9293149709701538, "adv/std_reasoning": 0.6187368631362915, "adv/std_step_conf": 0.8505994081497192, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.0859375, "calib/ece": 0.16473895582329326, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.01606425702811245, "calib/gap": -0.034671428571428575, "calib/mean_conf": 0.6607228915662651, "calib/mu_c": 0.6539, "calib/mu_w": 0.6885714285714286, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011124497991967871, "calib/std_conf": 0.07473298867607345, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5910108168593808, "calib/step_q_c_n": 2681.0, "calib/step_q_gap": -0.034198666629188135, "calib/step_q_w": 0.625209483488569, "calib/step_q_w_n": 1181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 776.60546875, "completions/mean_terminated_length": 792.0757446289062, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.0224, "grad_norm": 0.31129884719848633, "kl": 0.020071029663085938, "learning_rate": 5e-06, "loss": -0.1007, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021010365337133408, "mask/share_reasoning": 0.7523726224899292, "mask/share_step_conf": 0.2070857584476471, "num_tokens": 6729219.0, "reward": 0.5802181363105774, "reward_std": 0.08217073231935501, "rewards/accuracy_reward_step": 0.78125, "rewards/final_brier_reward_step": 0.7830750346183777, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.02658003382384777, "step": 21 }, { "adv/mean_abs_final_conf": 0.7307391166687012, "adv/mean_abs_reasoning": 0.18713341653347015, "adv/mean_abs_step_conf": 0.6073583364486694, "adv/ratio_final_to_reasoning": 3.904909824259011, "adv/ratio_step_to_reasoning": 3.2455899523431135, "adv/std_final_conf": 0.9273250699043274, "adv/std_reasoning": 0.4675096869468689, "adv/std_step_conf": 0.8511538505554199, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.73828125, "calib/ece": 0.11175781250000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.0027857142857141692, "calib/mean_conf": 0.6611328125, "calib/mu_c": 0.6617857142857142, "calib/mu_w": 0.659, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003632812499999997, "calib/std_conf": 0.058429026269824526, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.590171009771987, "calib/step_q_c_n": 2456.0, "calib/step_q_gap": 0.0005436805794403199, "calib/step_q_w": 0.5896273291925467, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 750.03515625, "completions/mean_terminated_length": 755.94091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.023466666666666667, "grad_norm": 0.27753156423568726, "kl": 0.022693634033203125, "learning_rate": 4.9722222222222224e-06, "loss": -0.0472, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02132297307252884, "mask/share_reasoning": 0.7679464817047119, "mask/share_step_conf": 0.20291808247566223, "num_tokens": 7023044.0, "reward": 0.5980198979377747, "reward_std": 0.04576648771762848, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.8072237968444824, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.035690926015377045, "step": 22 }, { "adv/mean_abs_final_conf": 0.7114725112915039, "adv/mean_abs_reasoning": 0.31436145305633545, "adv/mean_abs_step_conf": 0.6340088844299316, "adv/ratio_final_to_reasoning": 2.2632307631050548, "adv/ratio_step_to_reasoning": 2.0168149697294897, "adv/std_final_conf": 0.9128880500793457, "adv/std_reasoning": 0.6185219883918762, "adv/std_step_conf": 0.8657233715057373, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 15.53515625, "calib/ece": 0.14125984251968499, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.01323305407463815, "calib/mean_conf": 0.6806299212598425, "calib/mu_c": 0.6779207920792079, "calib/mu_w": 0.691153846153846, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01330708661417321, "calib/std_conf": 0.09799363655618468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6040825998645905, "calib/step_q_c_n": 2954.0, "calib/step_q_gap": -0.004013196811851416, "calib/step_q_w": 0.6080957966764419, "calib/step_q_w_n": 1023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 878.9140625, "completions/mean_terminated_length": 889.3359985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.024533333333333334, "grad_norm": 0.3237517178058624, "kl": 0.019628524780273438, "learning_rate": 4.944444444444445e-06, "loss": -0.0817, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019940588623285294, "mask/share_reasoning": 0.7632123827934265, "mask/share_step_conf": 0.2051282525062561, "num_tokens": 7351982.0, "reward": 0.5972261428833008, "reward_std": 0.07706841081380844, "rewards/accuracy_reward_step": 0.7890625, "rewards/final_brier_reward_step": 0.8038030862808228, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.03439916670322418, "step": 23 }, { "adv/mean_abs_final_conf": 0.7458991408348083, "adv/mean_abs_reasoning": 0.3374817967414856, "adv/mean_abs_step_conf": 0.6277369260787964, "adv/ratio_final_to_reasoning": 2.210190736320438, "adv/ratio_step_to_reasoning": 1.8600615859576246, "adv/std_final_conf": 0.9292863011360168, "adv/std_reasoning": 0.6185944676399231, "adv/std_step_conf": 0.8499169945716858, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.07421875, "calib/ece": 0.06044143426294826, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": 0.0008164342357123155, "calib/mean_conf": 0.667373705179283, "calib/mu_c": 0.6676729559748428, "calib/mu_w": 0.6668565217391305, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04717450199203192, "calib/std_conf": 0.08415062823820862, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.591605249873801, "calib/step_q_c_n": 1981.0, "calib/step_q_gap": -0.015375452962203884, "calib/step_q_w": 0.6069807028360049, "calib/step_q_w_n": 1622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 811.828125, "completions/mean_terminated_length": 821.45458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 251.0, "epoch": 0.0256, "grad_norm": 0.33109045028686523, "kl": 0.024871826171875, "learning_rate": 4.9166666666666665e-06, "loss": -0.0978, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02108951471745968, "mask/share_reasoning": 0.7626346349716187, "mask/share_step_conf": 0.20455709099769592, "num_tokens": 7664322.0, "reward": 0.5434887409210205, "reward_std": 0.08392756432294846, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7451183199882507, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.021546650677919388, "step": 24 }, { "adv/mean_abs_final_conf": 0.707527756690979, "adv/mean_abs_reasoning": 0.19326797127723694, "adv/mean_abs_step_conf": 0.5437524914741516, "adv/ratio_final_to_reasoning": 3.660863991147567, "adv/ratio_step_to_reasoning": 2.8134640617412776, "adv/std_final_conf": 0.927862823009491, "adv/std_reasoning": 0.4959583878517151, "adv/std_step_conf": 0.8003897666931152, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.15234375, "calib/ece": 0.1054183266932271, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.023243804832999815, "calib/mean_conf": 0.6718725099601593, "calib/mu_c": 0.6651123595505618, "calib/mu_w": 0.6883561643835616, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03406374501992032, "calib/std_conf": 0.056836240624110364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5857778776978417, "calib/step_q_c_n": 2224.0, "calib/step_q_gap": -0.036304323874710054, "calib/step_q_w": 0.6220822015725518, "calib/step_q_w_n": 1399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 801.66796875, "completions/mean_terminated_length": 811.1739501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.02666666666666667, "grad_norm": 0.1912597119808197, "kl": 0.023283004760742188, "learning_rate": 4.888888888888889e-06, "loss": -0.0584, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01987135224044323, "mask/share_reasoning": 0.7582444548606873, "mask/share_step_conf": 0.21016541123390198, "num_tokens": 7972773.0, "reward": 0.5612221956253052, "reward_std": 0.057776253670454025, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7643148899078369, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.022973205894231796, "step": 25 }, { "adv/mean_abs_final_conf": 0.7434746026992798, "adv/mean_abs_reasoning": 0.3130928874015808, "adv/mean_abs_step_conf": 0.5985824465751648, "adv/ratio_final_to_reasoning": 2.374613517635361, "adv/ratio_step_to_reasoning": 1.9118366167399004, "adv/std_final_conf": 0.9266886115074158, "adv/std_reasoning": 0.596113920211792, "adv/std_step_conf": 0.8195051550865173, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.53515625, "calib/ece": 0.14855502008032126, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.006396436122017457, "calib/mean_conf": 0.657791967871486, "calib/mu_c": 0.6558139534883721, "calib/mu_w": 0.6622103896103896, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05779196787148594, "calib/std_conf": 0.07257252414506551, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5822783018867925, "calib/step_q_c_n": 2120.0, "calib/step_q_gap": -0.03643306600827312, "calib/step_q_w": 0.6187113678950656, "calib/step_q_w_n": 1601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 760.25390625, "completions/mean_terminated_length": 775.3984375, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.027733333333333332, "grad_norm": 0.3038398325443268, "kl": 0.026044845581054688, "learning_rate": 4.861111111111111e-06, "loss": -0.0717, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019195556640625, "mask/share_reasoning": 0.758608341217041, "mask/share_step_conf": 0.2026648223400116, "num_tokens": 8272638.0, "reward": 0.5579166412353516, "reward_std": 0.07703261822462082, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.756049633026123, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.030877456068992615, "step": 26 }, { "adv/mean_abs_final_conf": 0.7445192933082581, "adv/mean_abs_reasoning": 0.31904906034469604, "adv/mean_abs_step_conf": 0.5985069274902344, "adv/ratio_final_to_reasoning": 2.3335573924082085, "adv/ratio_step_to_reasoning": 1.8759087610025118, "adv/std_final_conf": 0.9297001361846924, "adv/std_reasoning": 0.6184704899787903, "adv/std_step_conf": 0.8344622850418091, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.73828125, "calib/ece": 0.09807843137254894, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.016436713836477934, "calib/mean_conf": 0.6729803921568628, "calib/mu_c": 0.6667924528301887, "calib/mu_w": 0.6832291666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07376470588235291, "calib/std_conf": 0.06798889476679937, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.58775, "calib/step_q_c_n": 2000.0, "calib/step_q_gap": -0.016983025708635413, "calib/step_q_w": 0.6047330257086354, "calib/step_q_w_n": 1517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 833.25, "completions/mean_terminated_length": 839.81103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.0288, "grad_norm": 0.3839709460735321, "kl": 0.03032684326171875, "learning_rate": 4.833333333333333e-06, "loss": -0.0129, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01979503408074379, "mask/share_reasoning": 0.7735280990600586, "mask/share_step_conf": 0.19886434078216553, "num_tokens": 8591166.0, "reward": 0.5505564212799072, "reward_std": 0.07114443182945251, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7475433945655823, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.03013189509510994, "step": 27 }, { "adv/mean_abs_final_conf": 0.7383830547332764, "adv/mean_abs_reasoning": 0.15154343843460083, "adv/mean_abs_step_conf": 0.505989670753479, "adv/ratio_final_to_reasoning": 4.8724185115538905, "adv/ratio_step_to_reasoning": 3.338908473901632, "adv/std_final_conf": 0.9208484292030334, "adv/std_reasoning": 0.40508344769477844, "adv/std_step_conf": 0.7664840817451477, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.31640625, "calib/ece": 0.20844563492063484, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.01984126984126984, "calib/gap": -0.02344361702127673, "calib/mean_conf": 0.6589353174603175, "calib/mu_c": 0.6529813829787233, "calib/mu_w": 0.676425, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06067460317460317, "calib/std_conf": 0.11771983312008158, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5964008502024291, "calib/step_q_c_n": 2470.0, "calib/step_q_gap": -0.01219032829515887, "calib/step_q_w": 0.6085911784975879, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 883.35546875, "completions/mean_terminated_length": 893.830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.029866666666666666, "grad_norm": 2.493471622467041, "kl": 0.038482666015625, "learning_rate": 4.805555555555556e-06, "loss": -0.0604, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019157718867063522, "mask/share_reasoning": 0.7706682682037354, "mask/share_step_conf": 0.19845527410507202, "num_tokens": 8924249.0, "reward": 0.5710136890411377, "reward_std": 0.06168290600180626, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7680135369300842, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.03026391752064228, "step": 28 }, { "adv/mean_abs_final_conf": 0.7686499357223511, "adv/mean_abs_reasoning": 0.3789054751396179, "adv/mean_abs_step_conf": 0.6669013500213623, "adv/ratio_final_to_reasoning": 2.028606040699521, "adv/ratio_step_to_reasoning": 1.7600731416605278, "adv/std_final_conf": 0.9301630854606628, "adv/std_reasoning": 0.6613036394119263, "adv/std_step_conf": 0.8817446231842041, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 17.3125, "calib/ece": 0.07533306772908363, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0199203187250996, "calib/gap": -0.01679394573531534, "calib/mean_conf": 0.6844278884462152, "calib/mu_c": 0.6789414201183432, "calib/mu_w": 0.6957353658536586, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04322709163346618, "calib/std_conf": 0.122758852859182, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6094287381473377, "calib/step_q_c_n": 2742.0, "calib/step_q_gap": -0.00011741569881607017, "calib/step_q_w": 0.6095461538461537, "calib/step_q_w_n": 1690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 988.06640625, "completions/mean_terminated_length": 1003.7500610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.030933333333333334, "grad_norm": 20.593656539916992, "kl": 0.20651626586914062, "learning_rate": 4.777777777777778e-06, "loss": -0.0883, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.016685109585523605, "mask/share_reasoning": 0.7725826501846313, "mask/share_step_conf": 0.19510728120803833, "num_tokens": 9284322.0, "reward": 0.5559663772583008, "reward_std": 0.09881217032670975, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7426596283912659, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.041148170828819275, "step": 29 }, { "adv/mean_abs_final_conf": 0.7729416489601135, "adv/mean_abs_reasoning": 0.3953835964202881, "adv/mean_abs_step_conf": 0.6607516407966614, "adv/ratio_final_to_reasoning": 1.9549158234133863, "adv/ratio_step_to_reasoning": 1.6711660442642395, "adv/std_final_conf": 0.9272094368934631, "adv/std_reasoning": 0.6613385677337646, "adv/std_step_conf": 0.8670967817306519, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.7734375, "calib/ece": 0.06759196787148586, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.008032128514056224, "calib/gap": -0.005014390021296, "calib/mean_conf": 0.697310843373494, "calib/mu_c": 0.6957803468208092, "calib/mu_w": 0.7007947368421052, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03506184738955824, "calib/std_conf": 0.08357829459976067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6052986811481769, "calib/step_q_c_n": 2578.0, "calib/step_q_gap": -0.002377277755932772, "calib/step_q_w": 0.6076759589041096, "calib/step_q_w_n": 1460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 943.890625, "completions/mean_terminated_length": 962.6932373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.032, "grad_norm": 23.295793533325195, "kl": 0.13172149658203125, "learning_rate": 4.75e-06, "loss": -0.0768, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.016622882336378098, "mask/share_reasoning": 0.765550434589386, "mask/share_step_conf": 0.1982954442501068, "num_tokens": 9632942.0, "reward": 0.5647997260093689, "reward_std": 0.10322060436010361, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7575246095657349, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.04238741099834442, "step": 30 }, { "adv/mean_abs_final_conf": 0.7606886625289917, "adv/mean_abs_reasoning": 0.40039771795272827, "adv/mean_abs_step_conf": 0.6468129754066467, "adv/ratio_final_to_reasoning": 1.8998326624299093, "adv/ratio_step_to_reasoning": 1.6154262284856746, "adv/std_final_conf": 0.9288567304611206, "adv/std_reasoning": 0.6613115072250366, "adv/std_step_conf": 0.8345862030982971, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 16.53125, "calib/ece": 0.0926899598393575, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.012048192771084338, "calib/gap": 0.00555919241573033, "calib/mean_conf": 0.6913261044176706, "calib/mu_c": 0.6933131250000001, "calib/mu_w": 0.6877539325842698, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0707228915662651, "calib/std_conf": 0.10581848025908569, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6108364389233955, "calib/step_q_c_n": 2415.0, "calib/step_q_gap": -0.018585080063946213, "calib/step_q_w": 0.6294215189873417, "calib/step_q_w_n": 1817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 919.84765625, "completions/mean_terminated_length": 945.706787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.03306666666666667, "grad_norm": 77.55889892578125, "kl": 0.10741424560546875, "learning_rate": 4.722222222222222e-06, "loss": -0.0953, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017139755189418793, "mask/share_reasoning": 0.7630516290664673, "mask/share_step_conf": 0.1924648880958557, "num_tokens": 9974335.0, "reward": 0.5456604957580566, "reward_std": 0.09380966424942017, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.738542914390564, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.033246852457523346, "step": 31 }, { "adv/mean_abs_final_conf": 0.7483454942703247, "adv/mean_abs_reasoning": 0.3946504294872284, "adv/mean_abs_step_conf": 0.591101884841919, "adv/ratio_final_to_reasoning": 1.896223691540522, "adv/ratio_step_to_reasoning": 1.497785991541783, "adv/std_final_conf": 0.9297779202461243, "adv/std_reasoning": 0.6815653443336487, "adv/std_step_conf": 0.832304060459137, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 16.9609375, "calib/ece": 0.15730119047619043, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": -0.043936311514572335, "calib/mean_conf": 0.6955559523809524, "calib/mu_c": 0.6796900621118013, "calib/mu_w": 0.7236263736263736, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10698412698412699, "calib/std_conf": 0.07975371616637646, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6061186650185414, "calib/step_q_c_n": 2427.0, "calib/step_q_gap": -0.008288489028456025, "calib/step_q_w": 0.6144071540469974, "calib/step_q_w_n": 1915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 909.0703125, "completions/mean_terminated_length": 916.2283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.034133333333333335, "grad_norm": 0.46288660168647766, "kl": 0.03807830810546875, "learning_rate": 4.694444444444445e-06, "loss": -0.0434, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01737382635474205, "mask/share_reasoning": 0.7727394104003906, "mask/share_step_conf": 0.20207428932189941, "num_tokens": 10313761.0, "reward": 0.5478274822235107, "reward_std": 0.09904163330793381, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7278913855552673, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.045107267796993256, "step": 32 }, { "adv/mean_abs_final_conf": 0.7612197995185852, "adv/mean_abs_reasoning": 0.37524527311325073, "adv/mean_abs_step_conf": 0.6346988677978516, "adv/ratio_final_to_reasoning": 2.0285926407628474, "adv/ratio_step_to_reasoning": 1.6914240185679743, "adv/std_final_conf": 0.9312652945518494, "adv/std_reasoning": 0.6612553000450134, "adv/std_step_conf": 0.85466468334198, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 16.16015625, "calib/ece": 0.10518458498023714, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.03162055335968379, "calib/gap": 0.027722876498175952, "calib/mean_conf": 0.693801185770751, "calib/mu_c": 0.7048684210526316, "calib/mu_w": 0.6771455445544556, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09909762845849801, "calib/std_conf": 0.13993269485543056, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6107536945812807, "calib/step_q_c_n": 2030.0, "calib/step_q_gap": -0.03945717395217907, "calib/step_q_w": 0.6502108685334598, "calib/step_q_w_n": 2107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 864.5703125, "completions/mean_terminated_length": 871.3779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.0352, "grad_norm": 0.9508217573165894, "kl": 0.06719207763671875, "learning_rate": 4.666666666666667e-06, "loss": 0.0206, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01880577951669693, "mask/share_reasoning": 0.7738245129585266, "mask/share_step_conf": 0.19955721497535706, "num_tokens": 10641963.0, "reward": 0.5473966002464294, "reward_std": 0.10497646033763885, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7364916801452637, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.04189526289701462, "step": 33 }, { "adv/mean_abs_final_conf": 0.753960371017456, "adv/mean_abs_reasoning": 0.4716504216194153, "adv/mean_abs_step_conf": 0.6549442410469055, "adv/ratio_final_to_reasoning": 1.5985576105894859, "adv/ratio_step_to_reasoning": 1.3886221892861867, "adv/std_final_conf": 0.9314026832580566, "adv/std_reasoning": 0.7206087708473206, "adv/std_step_conf": 0.8729121685028076, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 15.7265625, "calib/ece": 0.15539529411764708, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.11764705882352941, "calib/gap": 0.06722285351755264, "calib/mean_conf": 0.7493890196078431, "calib/mu_c": 0.7710057803468209, "calib/mu_w": 0.7037829268292682, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1131764705882353, "calib/std_conf": 0.23408845495277225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6559398144712429, "calib/step_q_c_n": 2695.0, "calib/step_q_gap": 0.018178131526088936, "calib/step_q_w": 0.637761682945154, "calib/step_q_w_n": 1331.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 820.75, "completions/mean_terminated_length": 827.2125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 389.0, "epoch": 0.03626666666666667, "grad_norm": 1.8004989624023438, "kl": 0.1200408935546875, "learning_rate": 4.638888888888889e-06, "loss": 0.0104, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018822304904460907, "mask/share_reasoning": 0.7638715505599976, "mask/share_step_conf": 0.20949357748031616, "num_tokens": 10957187.0, "reward": 0.5783343315124512, "reward_std": 0.14782200753688812, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.748401403427124, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.07389220595359802, "step": 34 }, { "adv/mean_abs_final_conf": 0.7771700620651245, "adv/mean_abs_reasoning": 0.39579248428344727, "adv/mean_abs_step_conf": 0.5836151838302612, "adv/ratio_final_to_reasoning": 1.9635796356066055, "adv/ratio_step_to_reasoning": 1.4745484237449655, "adv/std_final_conf": 0.9326081275939941, "adv/std_reasoning": 0.6815245747566223, "adv/std_step_conf": 0.8389482498168945, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 17.5078125, "calib/ece": 0.27989444444444433, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.1984126984126984, "calib/gap": -0.0006252383850678944, "calib/mean_conf": 0.7447087301587302, "calib/mu_c": 0.7444779874213836, "calib/mu_w": 0.7451032258064515, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19682539682539676, "calib/std_conf": 0.26671001448866943, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.657295349777598, "calib/step_q_c_n": 2473.0, "calib/step_q_gap": -0.004080707962571206, "calib/step_q_w": 0.6613760577401692, "calib/step_q_w_n": 2009.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 963.65234375, "completions/mean_terminated_length": 975.0791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.037333333333333336, "grad_norm": 2.1660313606262207, "kl": 0.1023101806640625, "learning_rate": 4.611111111111112e-06, "loss": -0.0413, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.016110636293888092, "mask/share_reasoning": 0.7811132669448853, "mask/share_step_conf": 0.19105733931064606, "num_tokens": 11313138.0, "reward": 0.5301204919815063, "reward_std": 0.15779337286949158, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6721141338348389, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.06703317910432816, "step": 35 }, { "adv/mean_abs_final_conf": 0.7380247116088867, "adv/mean_abs_reasoning": 0.2726476490497589, "adv/mean_abs_step_conf": 0.647517740726471, "adv/ratio_final_to_reasoning": 2.706880892540523, "adv/ratio_step_to_reasoning": 2.3749250836499867, "adv/std_final_conf": 0.9322901964187622, "adv/std_reasoning": 0.5728190541267395, "adv/std_step_conf": 0.8712092041969299, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 17.12109375, "calib/ece": 0.13891155378486053, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1593625498007968, "calib/gap": 0.11752367346938775, "calib/mean_conf": 0.7103314741035857, "calib/mu_c": 0.7360836734693877, "calib/mu_w": 0.61856, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.034183266932270896, "calib/std_conf": 0.2845890761017935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6628022576361222, "calib/step_q_c_n": 3012.0, "calib/step_q_gap": 0.016343906067923797, "calib/step_q_w": 0.6464583515681984, "calib/step_q_w_n": 1371.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 851.8046875, "completions/mean_terminated_length": 872.248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.0384, "grad_norm": 1.1923136711120605, "kl": 0.125030517578125, "learning_rate": 4.583333333333333e-06, "loss": -0.0812, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018202677369117737, "mask/share_reasoning": 0.7517971396446228, "mask/share_step_conf": 0.20656266808509827, "num_tokens": 11633912.0, "reward": 0.5910356044769287, "reward_std": 0.1539851725101471, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7678468823432922, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.0650056004524231, "step": 36 }, { "adv/mean_abs_final_conf": 0.7172503471374512, "adv/mean_abs_reasoning": 0.4631299674510956, "adv/mean_abs_step_conf": 0.5714775323867798, "adv/ratio_final_to_reasoning": 1.5487020869863912, "adv/ratio_step_to_reasoning": 1.2339463488661533, "adv/std_final_conf": 0.9181895852088928, "adv/std_reasoning": 0.7393516302108765, "adv/std_step_conf": 0.8393474817276001, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 16.50390625, "calib/ece": 0.20361517857142847, "calib/final_conf_rate": 0.875, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 0.17410714285714285, "calib/gap": 0.03333103686437, "calib/mean_conf": 0.7635276785714286, "calib/mu_c": 0.7755804195804195, "calib/mu_w": 0.7422493827160495, "calib/nonempty_final_conf_rate": 0.875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1643749999999999, "calib/std_conf": 0.24160783129818242, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6688296014330497, "calib/step_q_c_n": 2233.0, "calib/step_q_gap": 0.03221454119208578, "calib/step_q_w": 0.6366150602409639, "calib/step_q_w_n": 1992.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 849.45703125, "completions/mean_terminated_length": 862.9405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.039466666666666664, "grad_norm": 3.8515422344207764, "kl": 0.1292877197265625, "learning_rate": 4.555555555555556e-06, "loss": -0.1164, "mask/has_final_conf_rate": 0.875, "mask/share_final_conf": 0.017217468470335007, "mask/share_reasoning": 0.7670263648033142, "mask/share_step_conf": 0.2001311331987381, "num_tokens": 11958469.0, "reward": 0.4886449873447418, "reward_std": 0.14419779181480408, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6216946840286255, "rewards/format_reward_step": 0.875, "rewards/step_margin_reward": 0.06106396019458771, "step": 37 }, { "adv/mean_abs_final_conf": 0.8081904649734497, "adv/mean_abs_reasoning": 0.7108105421066284, "adv/mean_abs_step_conf": 0.5965287089347839, "adv/ratio_final_to_reasoning": 1.136998422361909, "adv/ratio_step_to_reasoning": 0.8392232157486754, "adv/std_final_conf": 0.9355867505073547, "adv/std_reasoning": 0.8904671669006348, "adv/std_step_conf": 0.8587121367454529, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 20.08984375, "calib/ece": 0.3537238888888889, "calib/final_conf_rate": 0.703125, "calib/format_rate": 0.703125, "calib/frac_conf_gt_0.9": 0.32222222222222224, "calib/gap": 0.1414482142857143, "calib/mean_conf": 0.5383872222222224, "calib/mu_c": 0.5918232142857143, "calib/mu_w": 0.45037499999999997, "calib/nonempty_final_conf_rate": 0.703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13494444444444448, "calib/std_conf": 0.4416079143350978, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6499626506024097, "calib/step_q_c_n": 2573.0, "calib/step_q_gap": 0.15248167783976374, "calib/step_q_w": 0.4974809727626459, "calib/step_q_w_n": 2570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 922.26953125, "completions/mean_terminated_length": 952.0201416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.04053333333333333, "grad_norm": 13.913761138916016, "kl": 0.1562652587890625, "learning_rate": 4.527777777777778e-06, "loss": -0.2601, "mask/has_final_conf_rate": 0.703125, "mask/share_final_conf": 0.012419766746461391, "mask/share_reasoning": 0.7490566968917847, "mask/share_step_conf": 0.20727355778217316, "num_tokens": 12301458.0, "reward": 0.3639047145843506, "reward_std": 0.27264875173568726, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.4425402283668518, "rewards/format_reward_step": 0.703125, "rewards/step_margin_reward": 0.023550385609269142, "step": 38 }, { "adv/mean_abs_final_conf": 0.7847657799720764, "adv/mean_abs_reasoning": 0.34654319286346436, "adv/mean_abs_step_conf": 0.6521954536437988, "adv/ratio_final_to_reasoning": 2.264554018469117, "adv/ratio_step_to_reasoning": 1.8820033608357714, "adv/std_final_conf": 0.9348078966140747, "adv/std_reasoning": 0.640278697013855, "adv/std_step_conf": 0.8904418349266052, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 16.54296875, "calib/ece": 0.4314366533864542, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3147410358565737, "calib/gap": 0.1220060851648353, "calib/mean_conf": 0.39740796812749, "calib/mu_c": 0.4416412500000001, "calib/mu_w": 0.3196351648351648, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09569721115537849, "calib/std_conf": 0.45804881324482766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6229245873889124, "calib/step_q_c_n": 2363.0, "calib/step_q_gap": 0.10884029251711758, "calib/step_q_w": 0.5140842948717949, "calib/step_q_w_n": 1872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 879.80078125, "completions/mean_terminated_length": 890.2332153320312, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.0416, "grad_norm": 2.39874267578125, "kl": 0.152252197265625, "learning_rate": 4.5e-06, "loss": -0.0102, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018277311697602272, "mask/share_reasoning": 0.7705078125, "mask/share_step_conf": 0.19949612021446228, "num_tokens": 12632775.0, "reward": 0.42221030592918396, "reward_std": 0.2368159294128418, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.5469609498977661, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": -0.023634128272533417, "step": 39 }, { "adv/mean_abs_final_conf": 0.7526578307151794, "adv/mean_abs_reasoning": 0.359994113445282, "adv/mean_abs_step_conf": 0.6284192800521851, "adv/ratio_final_to_reasoning": 2.0907503834214256, "adv/ratio_step_to_reasoning": 1.7456376551215549, "adv/std_final_conf": 0.9165927171707153, "adv/std_reasoning": 0.6402466893196106, "adv/std_step_conf": 0.8590297102928162, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 18.49609375, "calib/ece": 0.6018401606425703, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.20080321285140562, "calib/gap": -0.12125155160628848, "calib/mean_conf": 0.23783855421686748, "calib/mu_c": 0.19157792207792207, "calib/mu_w": 0.31282947368421055, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11060240963855418, "calib/std_conf": 0.39857486560681676, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5107157802042973, "calib/step_q_c_n": 2839.0, "calib/step_q_gap": 0.027086455309782542, "calib/step_q_w": 0.48362932489451477, "calib/step_q_w_n": 1896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 946.015625, "completions/mean_terminated_length": 957.2332153320312, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.042666666666666665, "grad_norm": 1.3916919231414795, "kl": 0.1547088623046875, "learning_rate": 4.472222222222223e-06, "loss": -0.0372, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.016756486147642136, "mask/share_reasoning": 0.7632251977920532, "mask/share_step_conf": 0.20829957723617554, "num_tokens": 12981715.0, "reward": 0.31116634607315063, "reward_std": 0.21380087733268738, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.3920474648475647, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": -0.08846484124660492, "step": 40 }, { "adv/mean_abs_final_conf": 0.7882683277130127, "adv/mean_abs_reasoning": 0.29561924934387207, "adv/mean_abs_step_conf": 0.6299017667770386, "adv/ratio_final_to_reasoning": 2.6664986446673447, "adv/ratio_step_to_reasoning": 2.130787383349047, "adv/std_final_conf": 0.930815577507019, "adv/std_reasoning": 0.5726768374443054, "adv/std_step_conf": 0.8430304527282715, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 16.1875, "calib/ece": 0.4181775590551181, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3228346456692913, "calib/gap": 0.17358806348506556, "calib/mean_conf": 0.4596964566929133, "calib/mu_c": 0.4890834123222748, "calib/mu_w": 0.31549534883720926, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02358267716535427, "calib/std_conf": 0.45717941819434066, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6162898284313725, "calib/step_q_c_n": 3264.0, "calib/step_q_gap": 0.06190641934046337, "calib/step_q_w": 0.5543834090909091, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 831.1953125, "completions/mean_terminated_length": 837.7401733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 386.0, "epoch": 0.04373333333333333, "grad_norm": 4.049104690551758, "kl": 0.1920166015625, "learning_rate": 4.444444444444444e-06, "loss": 0.0009, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018943600356578827, "mask/share_reasoning": 0.7585512399673462, "mask/share_step_conf": 0.2146926373243332, "num_tokens": 13301749.0, "reward": 0.45164954662323, "reward_std": 0.23738747835159302, "rewards/accuracy_reward_step": 0.828125, "rewards/final_brier_reward_step": 0.5571421384811401, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": -0.017905592918395996, "step": 41 }, { "adv/mean_abs_final_conf": 0.7633222937583923, "adv/mean_abs_reasoning": 0.2732507884502411, "adv/mean_abs_step_conf": 0.5937932729721069, "adv/ratio_final_to_reasoning": 2.7934861527303267, "adv/ratio_step_to_reasoning": 2.1730706664739836, "adv/std_final_conf": 0.9305040836334229, "adv/std_reasoning": 0.5726051926612854, "adv/std_step_conf": 0.840979814529419, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 15.49609375, "calib/ece": 0.3167365079365079, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4007936507936508, "calib/gap": 0.004208311229001005, "calib/mean_conf": 0.6858031746031745, "calib/mu_c": 0.6871057471264368, "calib/mu_w": 0.6828974358974358, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15603174603174602, "calib/std_conf": 0.375200081318845, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6613030564545127, "calib/step_q_c_n": 2781.0, "calib/step_q_gap": 0.02169175797221934, "calib/step_q_w": 0.6396112984822934, "calib/step_q_w_n": 1186.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 765.09765625, "completions/mean_terminated_length": 777.2421264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.0448, "grad_norm": 249.845458984375, "kl": 8.14959716796875, "learning_rate": 4.416666666666667e-06, "loss": 0.0797, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0191387627273798, "mask/share_reasoning": 0.7546572089195251, "mask/share_step_conf": 0.2105790227651596, "num_tokens": 13601982.0, "reward": 0.5088764429092407, "reward_std": 0.18659836053848267, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.6371692419052124, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.04542739316821098, "step": 42 }, { "adv/mean_abs_final_conf": 0.7796351909637451, "adv/mean_abs_reasoning": 0.3929508924484253, "adv/mean_abs_step_conf": 0.6887763142585754, "adv/ratio_final_to_reasoning": 1.9840524756310918, "adv/ratio_step_to_reasoning": 1.7528305126549042, "adv/std_final_conf": 0.9343047142028809, "adv/std_reasoning": 0.6612555384635925, "adv/std_step_conf": 0.9042664766311646, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 17.1484375, "calib/ece": 0.25004566929133853, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2677165354330709, "calib/gap": 0.02996012250161173, "calib/mean_conf": 0.6705055118110237, "calib/mu_c": 0.6782904255319149, "calib/mu_w": 0.6483303030303031, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09019685039370078, "calib/std_conf": 0.34314449547205267, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6246479186834463, "calib/step_q_c_n": 3099.0, "calib/step_q_gap": 0.030246601874770795, "calib/step_q_w": 0.5944013168086755, "calib/step_q_w_n": 1291.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 878.5, "completions/mean_terminated_length": 885.4172973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.04586666666666667, "grad_norm": 1.552886724472046, "kl": 0.198394775390625, "learning_rate": 4.388888888888889e-06, "loss": -0.0167, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01802043803036213, "mask/share_reasoning": 0.7663673162460327, "mask/share_step_conf": 0.2077997326850891, "num_tokens": 13932102.0, "reward": 0.5526186227798462, "reward_std": 0.169440358877182, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.6911579370498657, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.06876671314239502, "step": 43 }, { "adv/mean_abs_final_conf": 0.7641512155532837, "adv/mean_abs_reasoning": 0.4402768015861511, "adv/mean_abs_step_conf": 0.6271167993545532, "adv/ratio_final_to_reasoning": 1.7356154419227525, "adv/ratio_step_to_reasoning": 1.4243693901093315, "adv/std_final_conf": 0.9358059167861938, "adv/std_reasoning": 0.7206077575683594, "adv/std_step_conf": 0.8419582843780518, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 18.640625, "calib/ece": 0.18296857142857148, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.19591836734693877, "calib/gap": 0.03705228070175448, "calib/mean_conf": 0.7537661224489797, "calib/mu_c": 0.7681333333333333, "calib/mu_w": 0.7310810526315789, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16224489795918373, "calib/std_conf": 0.1857926899203027, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.600721026802567, "calib/step_q_c_n": 2649.0, "calib/step_q_gap": -0.003705916202614401, "calib/step_q_w": 0.6044269430051814, "calib/step_q_w_n": 2123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 859.12890625, "completions/mean_terminated_length": 894.0527954101562, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.046933333333333334, "grad_norm": 1.1150966882705688, "kl": 0.16571044921875, "learning_rate": 4.361111111111112e-06, "loss": -0.1341, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016945868730545044, "mask/share_reasoning": 0.7451803684234619, "mask/share_step_conf": 0.19881124794483185, "num_tokens": 14258359.0, "reward": 0.5474046468734741, "reward_std": 0.16996341943740845, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6944642066955566, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.09018873423337936, "step": 44 }, { "adv/mean_abs_final_conf": 0.756545901298523, "adv/mean_abs_reasoning": 0.42394763231277466, "adv/mean_abs_step_conf": 0.7025113105773926, "adv/ratio_final_to_reasoning": 1.78452677556262, "adv/ratio_step_to_reasoning": 1.6570709612056584, "adv/std_final_conf": 0.935492992401123, "adv/std_reasoning": 0.7015060782432556, "adv/std_step_conf": 0.9042750000953674, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 18.265625, "calib/ece": 0.11956734693877547, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.11020408163265306, "calib/gap": 0.00505797101449279, "calib/mean_conf": 0.7135755102040817, "calib/mu_c": 0.715, "calib/mu_w": 0.7099420289855072, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05738775510204078, "calib/std_conf": 0.17422795123655282, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5407490272373541, "calib/step_q_c_n": 3084.0, "calib/step_q_gap": -0.02061278180787207, "calib/step_q_w": 0.5613618090452261, "calib/step_q_w_n": 1592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3049.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 859.15625, "completions/mean_terminated_length": 890.4615478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.048, "grad_norm": 7.584715366363525, "kl": 0.45855712890625, "learning_rate": 4.333333333333334e-06, "loss": -0.0599, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01749027520418167, "mask/share_reasoning": 0.7472882866859436, "mask/share_step_conf": 0.20006519556045532, "num_tokens": 14583351.0, "reward": 0.5705052018165588, "reward_std": 0.15893591940402985, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7362944483757019, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.07346594333648682, "step": 45 }, { "adv/mean_abs_final_conf": 0.7510257959365845, "adv/mean_abs_reasoning": 0.5484082102775574, "adv/mean_abs_step_conf": 0.7050764560699463, "adv/ratio_final_to_reasoning": 1.369464901986933, "adv/ratio_step_to_reasoning": 1.2856781551703917, "adv/std_final_conf": 0.9359838366508484, "adv/std_reasoning": 0.7928950190544128, "adv/std_step_conf": 0.9049521088600159, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 18.79296875, "calib/ece": 0.12113692946058094, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.11618257261410789, "calib/gap": 0.01165860576205413, "calib/mean_conf": 0.7049211618257263, "calib/mu_c": 0.7091298701298702, "calib/mu_w": 0.697471264367816, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09352697095435689, "calib/std_conf": 0.1649222060048462, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5419610073111292, "calib/step_q_c_n": 2462.0, "calib/step_q_gap": -0.0590053613563889, "calib/step_q_w": 0.6009663686675181, "calib/step_q_w_n": 2349.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 825.8671875, "completions/mean_terminated_length": 873.6445922851562, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.04906666666666667, "grad_norm": 1.147646188735962, "kl": 0.178741455078125, "learning_rate": 4.305555555555556e-06, "loss": -0.1523, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018486380577087402, "mask/share_reasoning": 0.7309393882751465, "mask/share_step_conf": 0.19588670134544373, "num_tokens": 14899541.0, "reward": 0.5333684086799622, "reward_std": 0.16860918700695038, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6996122598648071, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.058530814945697784, "step": 46 }, { "adv/mean_abs_final_conf": 0.7626035809516907, "adv/mean_abs_reasoning": 0.2406388521194458, "adv/mean_abs_step_conf": 0.6646240949630737, "adv/ratio_final_to_reasoning": 3.1690791999504615, "adv/ratio_step_to_reasoning": 2.761915165025698, "adv/std_final_conf": 0.9349892735481262, "adv/std_reasoning": 0.5227157473564148, "adv/std_step_conf": 0.8726968169212341, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 17.01171875, "calib/ece": 0.12545816733067722, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.11553784860557768, "calib/gap": 0.0007207207207207134, "calib/mean_conf": 0.7364541832669322, "calib/mu_c": 0.7366666666666666, "calib/mu_w": 0.7359459459459459, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07836653386454182, "calib/std_conf": 0.1407088517775483, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.520901875901876, "calib/step_q_c_n": 2772.0, "calib/step_q_gap": -0.04521941279048036, "calib/step_q_w": 0.5661212886923563, "calib/step_q_w_n": 1583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 861.109375, "completions/mean_terminated_length": 871.3201904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.050133333333333335, "grad_norm": 2.5522289276123047, "kl": 0.23779296875, "learning_rate": 4.277777777777778e-06, "loss": -0.0587, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017462948337197304, "mask/share_reasoning": 0.7714071273803711, "mask/share_step_conf": 0.19941113889217377, "num_tokens": 15225961.0, "reward": 0.59224933385849, "reward_std": 0.1048864796757698, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7565503716468811, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.09201083332300186, "step": 47 }, { "adv/mean_abs_final_conf": 0.7653645277023315, "adv/mean_abs_reasoning": 0.5050486922264099, "adv/mean_abs_step_conf": 0.6120218634605408, "adv/ratio_final_to_reasoning": 1.5154272043124584, "adv/ratio_step_to_reasoning": 1.2118076393041635, "adv/std_final_conf": 0.9359065890312195, "adv/std_reasoning": 0.7753804922103882, "adv/std_step_conf": 0.8563839197158813, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 16.02734375, "calib/ece": 0.2003212851405623, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.08835341365461848, "calib/gap": -0.043063204451419645, "calib/mean_conf": 0.6985542168674699, "calib/mu_c": 0.683680981595092, "calib/mu_w": 0.7267441860465117, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12212851405622488, "calib/std_conf": 0.14105568011199188, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5007367475292004, "calib/step_q_c_n": 2226.0, "calib/step_q_gap": -0.08999846824064506, "calib/step_q_w": 0.5907352157698454, "calib/step_q_w_n": 1877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 774.2265625, "completions/mean_terminated_length": 789.6494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.0512, "grad_norm": 1.180692434310913, "kl": 0.198089599609375, "learning_rate": 4.25e-06, "loss": -0.0692, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02045733667910099, "mask/share_reasoning": 0.7637689113616943, "mask/share_step_conf": 0.19624248147010803, "num_tokens": 15527851.0, "reward": 0.5507139563560486, "reward_std": 0.15430349111557007, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7125750184059143, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.066977858543396, "step": 48 }, { "adv/mean_abs_final_conf": 0.7673805356025696, "adv/mean_abs_reasoning": 0.40805861353874207, "adv/mean_abs_step_conf": 0.6758687496185303, "adv/ratio_final_to_reasoning": 1.8805644829985009, "adv/ratio_step_to_reasoning": 1.6563031074317995, "adv/std_final_conf": 0.9355793595314026, "adv/std_reasoning": 0.6613188982009888, "adv/std_step_conf": 0.8738494515419006, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.1015625, "calib/ece": 0.11163968253968251, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.05952380952380952, "calib/gap": 0.008185054945054948, "calib/mean_conf": 0.6840999999999999, "calib/mu_c": 0.6863736263736264, "calib/mu_w": 0.6781885714285715, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03675873015873016, "calib/std_conf": 0.14843343492345182, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5284627035213935, "calib/step_q_c_n": 2641.0, "calib/step_q_gap": 0.004620744337719973, "calib/step_q_w": 0.5238419591836735, "calib/step_q_w_n": 1225.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 781.9296875, "completions/mean_terminated_length": 791.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.05226666666666667, "grad_norm": 1.0000989437103271, "kl": 0.19378662109375, "learning_rate": 4.222222222222223e-06, "loss": -0.0547, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019703930243849754, "mask/share_reasoning": 0.7719682455062866, "mask/share_step_conf": 0.1966090351343155, "num_tokens": 15832561.0, "reward": 0.5932475924491882, "reward_std": 0.12978488206863403, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7670062780380249, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.07964511215686798, "step": 49 }, { "adv/mean_abs_final_conf": 0.7690557837486267, "adv/mean_abs_reasoning": 0.3387686312198639, "adv/mean_abs_step_conf": 0.6234382390975952, "adv/ratio_final_to_reasoning": 2.270150518303162, "adv/ratio_step_to_reasoning": 1.8403068691828737, "adv/std_final_conf": 0.9357212781906128, "adv/std_reasoning": 0.6402995586395264, "adv/std_step_conf": 0.8528716564178467, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.671875, "calib/ece": 0.16543999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.08, "calib/gap": -0.022112266112266132, "calib/mean_conf": 0.6725599999999999, "calib/mu_c": 0.6668108108108107, "calib/mu_w": 0.6889230769230769, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04900000000000003, "calib/std_conf": 0.14565935054091103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5091788743253663, "calib/step_q_c_n": 2594.0, "calib/step_q_gap": -0.013506151492189589, "calib/step_q_w": 0.5226850258175559, "calib/step_q_w_n": 1162.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1994.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 748.32421875, "completions/mean_terminated_length": 754.216552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.05333333333333334, "grad_norm": 0.9171644449234009, "kl": 0.2061767578125, "learning_rate": 4.194444444444445e-06, "loss": -0.0888, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02141934633255005, "mask/share_reasoning": 0.7716706991195679, "mask/share_step_conf": 0.19909745454788208, "num_tokens": 16129492.0, "reward": 0.5873520970344543, "reward_std": 0.12604889273643494, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7552015781402588, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.07887758314609528, "step": 50 }, { "adv/mean_abs_final_conf": 0.758726954460144, "adv/mean_abs_reasoning": 0.36322999000549316, "adv/mean_abs_step_conf": 0.6598608493804932, "adv/ratio_final_to_reasoning": 2.0888334535611164, "adv/ratio_step_to_reasoning": 1.8166474892960078, "adv/std_final_conf": 0.9354365468025208, "adv/std_reasoning": 0.6610972285270691, "adv/std_step_conf": 0.8887801766395569, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 14.765625, "calib/ece": 0.1910546875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.11328125, "calib/gap": -0.06335106382978706, "calib/mean_conf": 0.7034765625, "calib/mu_c": 0.6866489361702128, "calib/mu_w": 0.7499999999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08007812499999997, "calib/std_conf": 0.14585424364132704, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5035037065938354, "calib/step_q_c_n": 2563.0, "calib/step_q_gap": -0.04075266152448831, "calib/step_q_w": 0.5442563681183237, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 803.07421875, "completions/mean_terminated_length": 809.3976440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.0544, "grad_norm": 1.2030285596847534, "kl": 0.195037841796875, "learning_rate": 4.166666666666667e-06, "loss": 0.0091, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020157014951109886, "mask/share_reasoning": 0.7813325524330139, "mask/share_step_conf": 0.19069793820381165, "num_tokens": 16444375.0, "reward": 0.6015352606773376, "reward_std": 0.12200477719306946, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7579878568649292, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.09820760786533356, "step": 51 }, { "adv/mean_abs_final_conf": 0.7669674158096313, "adv/mean_abs_reasoning": 0.18372583389282227, "adv/mean_abs_step_conf": 0.705299973487854, "adv/ratio_final_to_reasoning": 4.174521348244619, "adv/ratio_step_to_reasoning": 3.8388720766361883, "adv/std_final_conf": 0.9351452589035034, "adv/std_reasoning": 0.4374311864376068, "adv/std_step_conf": 0.903425395488739, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.734375, "calib/ece": 0.16539062499999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.09765625, "calib/gap": 0.0019213250517599034, "calib/mean_conf": 0.67875, "calib/mu_c": 0.6790952380952382, "calib/mu_w": 0.6771739130434783, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011914062499999978, "calib/std_conf": 0.1553499074669824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5023640167364016, "calib/step_q_c_n": 2868.0, "calib/step_q_gap": 0.03392265871171024, "calib/step_q_w": 0.4684413580246914, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 760.84375, "completions/mean_terminated_length": 766.8346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 289.0, "epoch": 0.055466666666666664, "grad_norm": 0.9788868427276611, "kl": 0.2137451171875, "learning_rate": 4.138888888888889e-06, "loss": -0.0505, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021420970559120178, "mask/share_reasoning": 0.7817922234535217, "mask/share_step_conf": 0.1889742910861969, "num_tokens": 16747103.0, "reward": 0.6501138806343079, "reward_std": 0.08566803485155106, "rewards/accuracy_reward_step": 0.8203125, "rewards/final_brier_reward_step": 0.8089929819107056, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.12717223167419434, "step": 52 }, { "adv/mean_abs_final_conf": 0.7650182247161865, "adv/mean_abs_reasoning": 0.36879515647888184, "adv/mean_abs_step_conf": 0.6698992252349854, "adv/ratio_final_to_reasoning": 2.0743716702255375, "adv/ratio_step_to_reasoning": 1.816453425340323, "adv/std_final_conf": 0.9354962706565857, "adv/std_reasoning": 0.6402270197868347, "adv/std_step_conf": 0.8737857341766357, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.5703125, "calib/ece": 0.20097656250000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.08203125, "calib/gap": -0.05537704918032793, "calib/mean_conf": 0.7151953124999999, "calib/mu_c": 0.702, "calib/mu_w": 0.7573770491803279, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07722656250000005, "calib/std_conf": 0.14481656277176083, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5216705336426914, "calib/step_q_c_n": 2586.0, "calib/step_q_gap": -0.020367754645596814, "calib/step_q_w": 0.5420382882882883, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 784.01953125, "completions/mean_terminated_length": 790.1929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.05653333333333333, "grad_norm": 0.5744272470474243, "kl": 0.21148681640625, "learning_rate": 4.111111111111111e-06, "loss": -0.0426, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019791677594184875, "mask/share_reasoning": 0.7987407445907593, "mask/share_step_conf": 0.17365510761737823, "num_tokens": 17053636.0, "reward": 0.6078897714614868, "reward_std": 0.12828344106674194, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7752581834793091, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.08817760646343231, "step": 53 }, { "adv/mean_abs_final_conf": 0.7916830778121948, "adv/mean_abs_reasoning": 0.2069225311279297, "adv/mean_abs_step_conf": 0.5848709344863892, "adv/ratio_final_to_reasoning": 3.8259877911639184, "adv/ratio_step_to_reasoning": 2.826521265220718, "adv/std_final_conf": 0.9353046417236328, "adv/std_reasoning": 0.4676089584827423, "adv/std_step_conf": 0.8257472515106201, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.6328125, "calib/ece": 0.17996093750000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.07421875, "calib/gap": 0.02836110082685428, "calib/mean_conf": 0.6812890625000001, "calib/mu_c": 0.6853881278538814, "calib/mu_w": 0.6570270270270271, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.002890624999999994, "calib/std_conf": 0.15071022756558725, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49881886642885453, "calib/step_q_c_n": 2523.0, "calib/step_q_gap": 0.010269415879403943, "calib/step_q_w": 0.4885494505494506, "calib/step_q_w_n": 455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 663.92578125, "completions/mean_terminated_length": 669.153564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.0576, "grad_norm": 2.6270413398742676, "kl": 0.2523193359375, "learning_rate": 4.083333333333334e-06, "loss": 0.0061, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.025035999715328217, "mask/share_reasoning": 0.7858097553253174, "mask/share_step_conf": 0.1813417375087738, "num_tokens": 17329833.0, "reward": 0.6468451023101807, "reward_std": 0.09642485529184341, "rewards/accuracy_reward_step": 0.85546875, "rewards/final_brier_reward_step": 0.8303191661834717, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.09227728843688965, "step": 54 }, { "adv/mean_abs_final_conf": 0.7900725603103638, "adv/mean_abs_reasoning": 0.38732653856277466, "adv/mean_abs_step_conf": 0.6862166523933411, "adv/ratio_final_to_reasoning": 2.0398100353309907, "adv/ratio_step_to_reasoning": 1.7716747603705052, "adv/std_final_conf": 0.9358355402946472, "adv/std_reasoning": 0.6612837910652161, "adv/std_step_conf": 0.9048165678977966, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.78125, "calib/ece": 0.17815999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.092, "calib/gap": -0.0014392982922452058, "calib/mean_conf": 0.7173600000000001, "calib/mu_c": 0.716778523489933, "calib/mu_w": 0.7182178217821782, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14975999999999998, "calib/std_conf": 0.14797915528884464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4989834660134722, "calib/step_q_c_n": 1633.0, "calib/step_q_gap": -0.048270957415447846, "calib/step_q_w": 0.5472544234289201, "calib/step_q_w_n": 1639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 703.51171875, "completions/mean_terminated_length": 717.5259399414062, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.058666666666666666, "grad_norm": 27.244579315185547, "kl": 0.24969482421875, "learning_rate": 4.055555555555556e-06, "loss": -0.0733, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.022532660514116287, "mask/share_reasoning": 0.7766234278678894, "mask/share_step_conf": 0.181312695145607, "num_tokens": 17617756.0, "reward": 0.5515916347503662, "reward_std": 0.1400744915008545, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7049773931503296, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.08648727834224701, "step": 55 }, { "adv/mean_abs_final_conf": 0.7837743163108826, "adv/mean_abs_reasoning": 0.4821297526359558, "adv/mean_abs_step_conf": 0.6652582883834839, "adv/ratio_final_to_reasoning": 1.6256501740988614, "adv/ratio_step_to_reasoning": 1.3798324719565769, "adv/std_final_conf": 0.9359019994735718, "adv/std_reasoning": 0.7206350564956665, "adv/std_step_conf": 0.8749900460243225, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.25, "calib/ece": 0.19551732283464568, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2283464566929134, "calib/gap": -0.008955006894740158, "calib/mean_conf": 0.7592070866141732, "calib/mu_c": 0.7557872611464969, "calib/mu_w": 0.764742268041237, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1683070866141732, "calib/std_conf": 0.1681407624961063, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5358365802011646, "calib/step_q_c_n": 1889.0, "calib/step_q_gap": -0.03024425380043927, "calib/step_q_w": 0.5660808340016039, "calib/step_q_w_n": 1247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2348.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 704.9140625, "completions/mean_terminated_length": 710.4645385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 244.0, "epoch": 0.05973333333333333, "grad_norm": 16.545696258544922, "kl": 0.248779296875, "learning_rate": 4.027777777777779e-06, "loss": -0.0361, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023570137098431587, "mask/share_reasoning": 0.7856115102767944, "mask/share_step_conf": 0.18300580978393555, "num_tokens": 17905054.0, "reward": 0.5521728992462158, "reward_std": 0.17187048494815826, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7059837579727173, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.07726814597845078, "step": 56 }, { "adv/mean_abs_final_conf": 0.7717324495315552, "adv/mean_abs_reasoning": 0.3633062541484833, "adv/mean_abs_step_conf": 0.6912558078765869, "adv/ratio_final_to_reasoning": 2.124192580555324, "adv/ratio_step_to_reasoning": 1.9026807273019601, "adv/std_final_conf": 0.9351680874824524, "adv/std_reasoning": 0.640233039855957, "adv/std_step_conf": 0.8745987415313721, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.82421875, "calib/ece": 0.15373545816733064, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2908366533864542, "calib/gap": -0.017319962894248753, "calib/mean_conf": 0.8026788844621514, "calib/mu_c": 0.7988836734693877, "calib/mu_w": 0.8162036363636365, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0877689243027888, "calib/std_conf": 0.15399733011199934, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5754106598984772, "calib/step_q_c_n": 2364.0, "calib/step_q_gap": -0.03773406262600598, "calib/step_q_w": 0.6131447225244832, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 663.44921875, "completions/mean_terminated_length": 676.6653442382812, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.0608, "grad_norm": 61.37419509887695, "kl": 0.268646240234375, "learning_rate": 4.000000000000001e-06, "loss": -0.0286, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.022753167897462845, "mask/share_reasoning": 0.7694884538650513, "mask/share_step_conf": 0.188227117061615, "num_tokens": 18181689.0, "reward": 0.6276879906654358, "reward_std": 0.1472686231136322, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7831728458404541, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.1229843869805336, "step": 57 }, { "adv/mean_abs_final_conf": 0.7331936359405518, "adv/mean_abs_reasoning": 0.4673541188240051, "adv/mean_abs_step_conf": 0.6613420248031616, "adv/ratio_final_to_reasoning": 1.5688181753602042, "adv/ratio_step_to_reasoning": 1.4150769152677733, "adv/std_final_conf": 0.9347412586212158, "adv/std_reasoning": 0.7393353581428528, "adv/std_step_conf": 0.8752701282501221, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.54296875, "calib/ece": 0.20596787148594387, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.357429718875502, "calib/gap": 0.03512728550295863, "calib/mean_conf": 0.8160401606425703, "calib/mu_c": 0.8273260355029587, "calib/mu_w": 0.7921987500000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1716465863453816, "calib/std_conf": 0.19137137908397986, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.639537118491921, "calib/step_q_c_n": 2228.0, "calib/step_q_gap": -0.017584834696133922, "calib/step_q_w": 0.6571219531880549, "calib/step_q_w_n": 1239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 772.9765625, "completions/mean_terminated_length": 794.706787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.06186666666666667, "grad_norm": 9.372441291809082, "kl": 0.243499755859375, "learning_rate": 3.972222222222223e-06, "loss": -0.0652, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020479410886764526, "mask/share_reasoning": 0.7819588780403137, "mask/share_step_conf": 0.17021796107292175, "num_tokens": 18485891.0, "reward": 0.573380708694458, "reward_std": 0.17932750284671783, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7214945554733276, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.09870444983243942, "step": 58 }, { "adv/mean_abs_final_conf": 0.7504489421844482, "adv/mean_abs_reasoning": 0.4621698260307312, "adv/mean_abs_step_conf": 0.6386820673942566, "adv/ratio_final_to_reasoning": 1.6237514868280225, "adv/ratio_step_to_reasoning": 1.3819207387022028, "adv/std_final_conf": 0.9361416101455688, "adv/std_reasoning": 0.7392333149909973, "adv/std_step_conf": 0.8436469435691833, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.9453125, "calib/ece": 0.2620114173228346, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": -0.013952311401020934, "calib/mean_conf": 0.8032641732283463, "calib/mu_c": 0.7987598837209302, "calib/mu_w": 0.8127121951219511, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1940551181102362, "calib/std_conf": 0.26044774079447564, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6302592832254853, "calib/step_q_c_n": 2009.0, "calib/step_q_gap": -0.0004373802635517787, "calib/step_q_w": 0.6306966634890371, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 716.1640625, "completions/mean_terminated_length": 724.6561279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.06293333333333333, "grad_norm": 8.109827041625977, "kl": 0.267791748046875, "learning_rate": 3.944444444444445e-06, "loss": -0.0357, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.023571684956550598, "mask/share_reasoning": 0.7911335229873657, "mask/share_step_conf": 0.17357605695724487, "num_tokens": 18775477.0, "reward": 0.5561695098876953, "reward_std": 0.22776079177856445, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6845602989196777, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.09574736654758453, "step": 59 }, { "adv/mean_abs_final_conf": 0.783907413482666, "adv/mean_abs_reasoning": 0.44255656003952026, "adv/mean_abs_step_conf": 0.6574190855026245, "adv/ratio_final_to_reasoning": 1.7713157690232026, "adv/ratio_step_to_reasoning": 1.48550297264584, "adv/std_final_conf": 0.9329642653465271, "adv/std_reasoning": 0.7206140160560608, "adv/std_step_conf": 0.8595014214515686, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 12.2890625, "calib/ece": 0.20925992063492066, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.46825396825396826, "calib/gap": 0.10986593680709522, "calib/mean_conf": 0.8168511904761905, "calib/mu_c": 0.8552170731707316, "calib/mu_w": 0.7453511363636364, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1876587301587302, "calib/std_conf": 0.23684537928585173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6476460933403252, "calib/step_q_c_n": 1907.0, "calib/step_q_gap": 0.016674503348396263, "calib/step_q_w": 0.6309715899919289, "calib/step_q_w_n": 1239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 708.765625, "completions/mean_terminated_length": 720.0159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.064, "grad_norm": 10.051220893859863, "kl": 0.273529052734375, "learning_rate": 3.916666666666667e-06, "loss": -0.0035, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022933753207325935, "mask/share_reasoning": 0.7813419103622437, "mask/share_step_conf": 0.18009933829307556, "num_tokens": 19065777.0, "reward": 0.5763288736343384, "reward_std": 0.19373396039009094, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7193988561630249, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.11138393729925156, "step": 60 }, { "adv/mean_abs_final_conf": 0.7766209244728088, "adv/mean_abs_reasoning": 0.39034169912338257, "adv/mean_abs_step_conf": 0.601711630821228, "adv/ratio_final_to_reasoning": 1.9895925191106159, "adv/ratio_step_to_reasoning": 1.5414997479709023, "adv/std_final_conf": 0.9335206151008606, "adv/std_reasoning": 0.6815312504768372, "adv/std_step_conf": 0.8435513377189636, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.609375, "calib/ece": 0.31292980392156877, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5137254901960784, "calib/gap": -0.034817016777103915, "calib/mean_conf": 0.7993839215686274, "calib/mu_c": 0.7881878612716764, "calib/mu_w": 0.8230048780487803, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21694117647058836, "calib/std_conf": 0.28078543733590494, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6056639582124204, "calib/step_q_c_n": 1723.0, "calib/step_q_gap": -0.015997975322322944, "calib/step_q_w": 0.6216619335347433, "calib/step_q_w_n": 993.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 617.234375, "completions/mean_terminated_length": 622.094482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.06506666666666666, "grad_norm": 44.98173904418945, "kl": 0.3096923828125, "learning_rate": 3.88888888888889e-06, "loss": -0.0122, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.027154112234711647, "mask/share_reasoning": 0.7856699228286743, "mask/share_step_conf": 0.17936351895332336, "num_tokens": 19327853.0, "reward": 0.5646814107894897, "reward_std": 0.22981145977973938, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6698556542396545, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.1282571256160736, "step": 61 }, { "adv/mean_abs_final_conf": 0.7539002299308777, "adv/mean_abs_reasoning": 0.48199620842933655, "adv/mean_abs_step_conf": 0.6898258924484253, "adv/ratio_final_to_reasoning": 1.564120664740465, "adv/ratio_step_to_reasoning": 1.4311853088976276, "adv/std_final_conf": 0.9354040622711182, "adv/std_reasoning": 0.739365816116333, "adv/std_step_conf": 0.8912215232849121, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.515625, "calib/ece": 0.2386551587301588, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.0037250000000000894, "calib/mean_conf": 0.8140440476190476, "calib/mu_c": 0.8151083333333334, "calib/mu_w": 0.8113833333333333, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16920674603174612, "calib/std_conf": 0.27045332652723747, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5826730943738656, "calib/step_q_c_n": 2204.0, "calib/step_q_gap": -0.03697860562613453, "calib/step_q_w": 0.6196517000000001, "calib/step_q_w_n": 1000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 687.2734375, "completions/mean_terminated_length": 698.1825561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.06613333333333334, "grad_norm": 37.392425537109375, "kl": 0.27716064453125, "learning_rate": 3.861111111111112e-06, "loss": -0.0545, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02452751062810421, "mask/share_reasoning": 0.7799397706985474, "mask/share_step_conf": 0.1799076795578003, "num_tokens": 19610875.0, "reward": 0.5806684494018555, "reward_std": 0.24473421275615692, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7016507387161255, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.12296748161315918, "step": 62 }, { "adv/mean_abs_final_conf": 0.7358971834182739, "adv/mean_abs_reasoning": 0.432090163230896, "adv/mean_abs_step_conf": 0.6293022036552429, "adv/ratio_final_to_reasoning": 1.7031102442038992, "adv/ratio_step_to_reasoning": 1.4564140941087862, "adv/std_final_conf": 0.9351003170013428, "adv/std_reasoning": 0.720492422580719, "adv/std_step_conf": 0.8755343556404114, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 11.20703125, "calib/ece": 0.27219724409448814, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4448818897637795, "calib/gap": 0.007648740310077673, "calib/mean_conf": 0.8210311023622047, "calib/mu_c": 0.8236208333333334, "calib/mu_w": 0.8159720930232557, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21590551181102358, "calib/std_conf": 0.23959935279423172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5948911862527716, "calib/step_q_c_n": 1804.0, "calib/step_q_gap": 0.0020115618396261414, "calib/step_q_w": 0.5928796244131455, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 703.6953125, "completions/mean_terminated_length": 712.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.0672, "grad_norm": 1.3926122188568115, "kl": 0.2791748046875, "learning_rate": 3.833333333333334e-06, "loss": 0.0205, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02410535141825676, "mask/share_reasoning": 0.7952975034713745, "mask/share_step_conf": 0.168878436088562, "num_tokens": 19899661.0, "reward": 0.5846147537231445, "reward_std": 0.2183077484369278, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6880985498428345, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.15300604701042175, "step": 63 }, { "adv/mean_abs_final_conf": 0.7396208047866821, "adv/mean_abs_reasoning": 0.36050862073898315, "adv/mean_abs_step_conf": 0.6242177486419678, "adv/ratio_final_to_reasoning": 2.0516036572733864, "adv/ratio_step_to_reasoning": 1.7314918776766681, "adv/std_final_conf": 0.9338341951370239, "adv/std_reasoning": 0.6403324007987976, "adv/std_step_conf": 0.8599122762680054, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 11.1875, "calib/ece": 0.2487636000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.596, "calib/gap": -0.030148054807927593, "calib/mean_conf": 0.8462763999999999, "calib/mu_c": 0.8381967213114754, "calib/mu_w": 0.868344776119403, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18152000000000013, "calib/std_conf": 0.251969499668194, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5824359238291301, "calib/step_q_c_n": 1943.0, "calib/step_q_gap": -0.06967851699605987, "calib/step_q_w": 0.65211444082519, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 611.25, "completions/mean_terminated_length": 625.9200439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.06826666666666667, "grad_norm": 1.2419276237487793, "kl": 0.314239501953125, "learning_rate": 3.8055555555555556e-06, "loss": -0.0402, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.025241907685995102, "mask/share_reasoning": 0.7825311422348022, "mask/share_step_conf": 0.16878946125507355, "num_tokens": 20159917.0, "reward": 0.5861185193061829, "reward_std": 0.2304370403289795, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.6986793279647827, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.13527649641036987, "step": 64 }, { "adv/mean_abs_final_conf": 0.7525701522827148, "adv/mean_abs_reasoning": 0.2651790380477905, "adv/mean_abs_step_conf": 0.6145387291908264, "adv/ratio_final_to_reasoning": 2.8379699912294227, "adv/ratio_step_to_reasoning": 2.3174483689018976, "adv/std_final_conf": 0.9266197085380554, "adv/std_reasoning": 0.572488009929657, "adv/std_step_conf": 0.8432523012161255, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.234375, "calib/ece": 0.29576210937499997, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.64453125, "calib/gap": 0.008698574697613926, "calib/mean_conf": 0.897831640625, "calib/mu_c": 0.9010596273291926, "calib/mu_w": 0.8923610526315787, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28234374999999995, "calib/std_conf": 0.15485483769221853, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5674722730042657, "calib/step_q_c_n": 1641.0, "calib/step_q_gap": 0.014914969633479158, "calib/step_q_w": 0.5525573033707866, "calib/step_q_w_n": 979.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 585.296875, "completions/mean_terminated_length": 589.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.06933333333333333, "grad_norm": 1.0370111465454102, "kl": 0.32586669921875, "learning_rate": 3.777777777777778e-06, "loss": 0.0075, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.027405641973018646, "mask/share_reasoning": 0.785406231880188, "mask/share_step_conf": 0.17937563359737396, "num_tokens": 20414777.0, "reward": 0.5884789824485779, "reward_std": 0.1469491571187973, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6730327606201172, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.17892518639564514, "step": 65 }, { "adv/mean_abs_final_conf": 0.76250159740448, "adv/mean_abs_reasoning": 0.43535977602005005, "adv/mean_abs_step_conf": 0.6619211435317993, "adv/ratio_final_to_reasoning": 1.7514286789998803, "adv/ratio_step_to_reasoning": 1.5204003217360054, "adv/std_final_conf": 0.9311895370483398, "adv/std_reasoning": 0.7207090854644775, "adv/std_step_conf": 0.875627338886261, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 13.01953125, "calib/ece": 0.40874493927125516, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6437246963562753, "calib/gap": -0.01084218811491533, "calib/mean_conf": 0.9076923076923077, "calib/mu_c": 0.9023809523809524, "calib/mu_w": 0.9132231404958677, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4031578947368422, "calib/std_conf": 0.10402009723648364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4740751240255139, "calib/step_q_c_n": 1411.0, "calib/step_q_gap": -0.07345349199946005, "calib/step_q_w": 0.5475286160249739, "calib/step_q_w_n": 1922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 722.234375, "completions/mean_terminated_length": 748.5505981445312, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.0704, "grad_norm": 1.0291537046432495, "kl": 0.26129150390625, "learning_rate": 3.7500000000000005e-06, "loss": -0.1164, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02183469384908676, "mask/share_reasoning": 0.7824565172195435, "mask/share_step_conf": 0.16055256128311157, "num_tokens": 20706021.0, "reward": 0.4741209149360657, "reward_std": 0.22096720337867737, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.554445743560791, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.1039523333311081, "step": 66 }, { "adv/mean_abs_final_conf": 0.7682864665985107, "adv/mean_abs_reasoning": 0.4244164228439331, "adv/mean_abs_step_conf": 0.6096725463867188, "adv/ratio_final_to_reasoning": 1.8102185147557919, "adv/ratio_step_to_reasoning": 1.4364961240222984, "adv/std_final_conf": 0.9361712336540222, "adv/std_reasoning": 0.7204695343971252, "adv/std_step_conf": 0.84321129322052, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.83984375, "calib/ece": 0.28509249011857707, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4268774703557312, "calib/gap": -0.04268690137023479, "calib/mean_conf": 0.8048679841897233, "calib/mu_c": 0.7895141975308642, "calib/mu_w": 0.832201098901099, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22482213438735177, "calib/std_conf": 0.21552196379980912, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.2893720588235294, "calib/step_q_c_n": 1700.0, "calib/step_q_gap": -0.07566049931600544, "calib/step_q_w": 0.36503255813953484, "calib/step_q_w_n": 1075.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 694.09375, "completions/mean_terminated_length": 702.3241577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.07146666666666666, "grad_norm": 1.5884639024734497, "kl": 0.27716064453125, "learning_rate": 3.7222222222222225e-06, "loss": -0.0467, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.023152736946940422, "mask/share_reasoning": 0.803122878074646, "mask/share_step_conf": 0.16200563311576843, "num_tokens": 20988717.0, "reward": 0.5525285005569458, "reward_std": 0.2043268233537674, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6637780070304871, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.11940403282642365, "step": 67 }, { "adv/mean_abs_final_conf": 0.7617519497871399, "adv/mean_abs_reasoning": 0.3916369080543518, "adv/mean_abs_step_conf": 0.5791916251182556, "adv/ratio_final_to_reasoning": 1.9450463787274694, "adv/ratio_step_to_reasoning": 1.4788994939105045, "adv/std_final_conf": 0.9319273233413696, "adv/std_reasoning": 0.6612851023674011, "adv/std_step_conf": 0.8105087876319885, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.60546875, "calib/ece": 0.2744149797570849, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5748987854251012, "calib/gap": -0.0012663043478261526, "calib/mean_conf": 0.8724716599190284, "calib/mu_c": 0.872, "calib/mu_w": 0.8732663043478261, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25967813765182174, "calib/std_conf": 0.15390637173820043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29178073510773134, "calib/step_q_c_n": 1578.0, "calib/step_q_gap": -0.16684094471997857, "calib/step_q_w": 0.4586216798277099, "calib/step_q_w_n": 1393.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 709.953125, "completions/mean_terminated_length": 726.9920043945312, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.07253333333333334, "grad_norm": 1.2024588584899902, "kl": 0.277191162109375, "learning_rate": 3.694444444444445e-06, "loss": -0.0324, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02384016662836075, "mask/share_reasoning": 0.7884659171104431, "mask/share_step_conf": 0.16425639390945435, "num_tokens": 21274553.0, "reward": 0.5623621940612793, "reward_std": 0.1624753475189209, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6578570008277893, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.1543673574924469, "step": 68 }, { "adv/mean_abs_final_conf": 0.7745255827903748, "adv/mean_abs_reasoning": 0.5398683547973633, "adv/mean_abs_step_conf": 0.6580164432525635, "adv/ratio_final_to_reasoning": 1.4346563859648502, "adv/ratio_step_to_reasoning": 1.218846108324957, "adv/std_final_conf": 0.9221952557563782, "adv/std_reasoning": 0.7754866480827332, "adv/std_step_conf": 0.8600015044212341, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 12.41015625, "calib/ece": 0.36751004016064265, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8755020080321285, "calib/gap": 0.01665674603174594, "calib/mean_conf": 0.9458232931726909, "calib/mu_c": 0.9528472222222221, "calib/mu_w": 0.9361904761904761, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36751004016064265, "calib/std_conf": 0.0792600373053975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39045735475896165, "calib/step_q_c_n": 1618.0, "calib/step_q_gap": -0.11180691721024932, "calib/step_q_w": 0.502264271969211, "calib/step_q_w_n": 1559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 766.44921875, "completions/mean_terminated_length": 784.8440551757812, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.0736, "grad_norm": 1.6587779521942139, "kl": 0.2686767578125, "learning_rate": 3.6666666666666666e-06, "loss": -0.0302, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02062370441854, "mask/share_reasoning": 0.8001226186752319, "mask/share_step_conf": 0.15581616759300232, "num_tokens": 21575260.0, "reward": 0.5319957733154297, "reward_std": 0.24367490410804749, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6033320426940918, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.15440955758094788, "step": 69 }, { "adv/mean_abs_final_conf": 0.6887208223342896, "adv/mean_abs_reasoning": 0.3762162923812866, "adv/mean_abs_step_conf": 0.5653591156005859, "adv/ratio_final_to_reasoning": 1.8306512404738886, "adv/ratio_step_to_reasoning": 1.5027502185567427, "adv/std_final_conf": 0.8945690989494324, "adv/std_reasoning": 0.6815224289894104, "adv/std_step_conf": 0.8105819225311279, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 11.78515625, "calib/ece": 0.3446093117408908, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9311740890688259, "calib/gap": 0.0021444281524929387, "calib/mean_conf": 0.9600789473684209, "calib/mu_c": 0.9608863636363637, "calib/mu_w": 0.9587419354838708, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34060323886639693, "calib/std_conf": 0.07443519341229098, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5036967632027257, "calib/step_q_c_n": 1761.0, "calib/step_q_gap": -0.028866134886446293, "calib/step_q_w": 0.532562898089172, "calib/step_q_w_n": 1256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 755.53515625, "completions/mean_terminated_length": 773.6680297851562, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, "epoch": 0.07466666666666667, "grad_norm": 1.2605822086334229, "kl": 0.252716064453125, "learning_rate": 3.638888888888889e-06, "loss": -0.0585, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.021993208676576614, "mask/share_reasoning": 0.7888750433921814, "mask/share_step_conf": 0.1656942367553711, "num_tokens": 21875669.0, "reward": 0.5579054355621338, "reward_std": 0.17114701867103577, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6246556043624878, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.17865526676177979, "step": 70 }, { "adv/mean_abs_final_conf": 0.7037140130996704, "adv/mean_abs_reasoning": 0.6130764484405518, "adv/mean_abs_step_conf": 0.6614584922790527, "adv/ratio_final_to_reasoning": 1.1478405586932403, "adv/ratio_step_to_reasoning": 1.07891682018053, "adv/std_final_conf": 0.918972373008728, "adv/std_reasoning": 0.8588365316390991, "adv/std_step_conf": 0.8758206963539124, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.41796875, "calib/ece": 0.32935714285714296, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9206349206349206, "calib/gap": 0.004954610606784482, "calib/mean_conf": 0.963484126984127, "calib/mu_c": 0.9652732919254657, "calib/mu_w": 0.9603186813186813, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3269761904761906, "calib/std_conf": 0.05988026864873096, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5245230607966457, "calib/step_q_c_n": 1908.0, "calib/step_q_gap": -0.06256584557628908, "calib/step_q_w": 0.5870889063729348, "calib/step_q_w_n": 1271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 736.09375, "completions/mean_terminated_length": 747.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.07573333333333333, "grad_norm": 1.374493956565857, "kl": 0.26165771484375, "learning_rate": 3.6111111111111115e-06, "loss": -0.0009, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022440209984779358, "mask/share_reasoning": 0.7956840991973877, "mask/share_step_conf": 0.16625072062015533, "num_tokens": 22168517.0, "reward": 0.5891203880310059, "reward_std": 0.2593870759010315, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6522749662399292, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.203309565782547, "step": 71 }, { "adv/mean_abs_final_conf": 0.7037005424499512, "adv/mean_abs_reasoning": 0.29564470052719116, "adv/mean_abs_step_conf": 0.602660059928894, "adv/ratio_final_to_reasoning": 2.380223765875452, "adv/ratio_step_to_reasoning": 2.0384605536789113, "adv/std_final_conf": 0.8824068903923035, "adv/std_reasoning": 0.5959694981575012, "adv/std_step_conf": 0.8436238169670105, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.37109375, "calib/ece": 0.40128858267716533, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.952755905511811, "calib/gap": 0.02412757575757596, "calib/mean_conf": 0.9682177165354331, "calib/mu_c": 0.9786666666666667, "calib/mu_w": 0.9545390909090907, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40128858267716533, "calib/std_conf": 0.08601273139317762, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5057326007326007, "calib/step_q_c_n": 1638.0, "calib/step_q_gap": -0.0382893945541235, "calib/step_q_w": 0.5440219952867242, "calib/step_q_w_n": 1273.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 663.0, "completions/mean_terminated_length": 668.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.0768, "grad_norm": 0.9264628887176514, "kl": 0.29205322265625, "learning_rate": 3.5833333333333335e-06, "loss": -0.0476, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02300153858959675, "mask/share_reasoning": 0.7928996086120605, "mask/share_step_conf": 0.17628639936447144, "num_tokens": 22442653.0, "reward": 0.5886973142623901, "reward_std": 0.18019168078899384, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5893194675445557, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2771375775337219, "step": 72 }, { "adv/mean_abs_final_conf": 0.6671165227890015, "adv/mean_abs_reasoning": 0.49489885568618774, "adv/mean_abs_step_conf": 0.6618808507919312, "adv/ratio_final_to_reasoning": 1.3479855835674348, "adv/ratio_step_to_reasoning": 1.3374063067376047, "adv/std_final_conf": 0.8313454985618591, "adv/std_reasoning": 0.7393487691879272, "adv/std_step_conf": 0.859908938407898, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.71484375, "calib/ece": 0.2669388196078431, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9647058823529412, "calib/gap": -0.006764875642823909, "calib/mean_conf": 0.9688337215686275, "calib/mu_c": 0.9670032258064517, "calib/mu_w": 0.9737681014492756, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2531803882352941, "calib/std_conf": 0.11327860894981376, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5308225419664268, "calib/step_q_c_n": 2085.0, "calib/step_q_gap": 0.0013051579401686109, "calib/step_q_w": 0.5295173840262581, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 690.875, "completions/mean_terminated_length": 696.31494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.07786666666666667, "grad_norm": 1.6943904161453247, "kl": 0.269775390625, "learning_rate": 3.555555555555556e-06, "loss": -0.0083, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022829489782452583, "mask/share_reasoning": 0.7974672317504883, "mask/share_step_conf": 0.1718907356262207, "num_tokens": 22726549.0, "reward": 0.6816904544830322, "reward_std": 0.24758538603782654, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.726353645324707, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2932772636413574, "step": 73 }, { "adv/mean_abs_final_conf": 0.684027910232544, "adv/mean_abs_reasoning": 0.42929649353027344, "adv/mean_abs_step_conf": 0.6507533192634583, "adv/ratio_final_to_reasoning": 1.5933694324114185, "adv/ratio_step_to_reasoning": 1.515859852271465, "adv/std_final_conf": 0.8723176717758179, "adv/std_reasoning": 0.7014433145523071, "adv/std_step_conf": 0.8599750399589539, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 11.33984375, "calib/ece": 0.41303135999999985, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.98, "calib/gap": 0.002038238023658656, "calib/mean_conf": 0.9772017599999999, "calib/mu_c": 0.9780741258741259, "calib/mu_w": 0.9760358878504672, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40911655999999985, "calib/std_conf": 0.08859416269316168, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5037612244897959, "calib/step_q_c_n": 1470.0, "calib/step_q_gap": -0.07674526539157189, "calib/step_q_w": 0.5805064898813678, "calib/step_q_w_n": 1433.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 698.56640625, "completions/mean_terminated_length": 715.33203125, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.07893333333333333, "grad_norm": 1.0564448833465576, "kl": 0.2877197265625, "learning_rate": 3.5277777777777784e-06, "loss": -0.0091, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.023499177768826485, "mask/share_reasoning": 0.7832315564155579, "mask/share_step_conf": 0.1698317527770996, "num_tokens": 23009310.0, "reward": 0.5486522912979126, "reward_std": 0.22563649713993073, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.56647127866745, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2253645807504654, "step": 74 }, { "adv/mean_abs_final_conf": 0.5781558156013489, "adv/mean_abs_reasoning": 0.2851184606552124, "adv/mean_abs_step_conf": 0.6577544212341309, "adv/ratio_final_to_reasoning": 2.0277740496800036, "adv/ratio_step_to_reasoning": 2.3069513623305475, "adv/std_final_conf": 0.8220309019088745, "adv/std_reasoning": 0.5959504246711731, "adv/std_step_conf": 0.8598629832267761, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.52734375, "calib/ece": 0.17877470355731245, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9604743083003953, "calib/gap": 0.023058128078817575, "calib/mean_conf": 0.974901185770751, "calib/mu_c": 0.9794581280788176, "calib/mu_w": 0.9564, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17565217391304366, "calib/std_conf": 0.05979532050524667, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5821794439764112, "calib/step_q_c_n": 2374.0, "calib/step_q_gap": -0.08426713465504154, "calib/step_q_w": 0.6664465786314527, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 691.41796875, "completions/mean_terminated_length": 696.8621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.08, "grad_norm": 2.8885269165039062, "kl": 0.262725830078125, "learning_rate": 3.5e-06, "loss": -0.0022, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.022322513163089752, "mask/share_reasoning": 0.7913630604743958, "mask/share_step_conf": 0.1785019338130951, "num_tokens": 23291065.0, "reward": 0.7158107757568359, "reward_std": 0.1600218415260315, "rewards/accuracy_reward_step": 0.796875, "rewards/final_brier_reward_step": 0.8058438301086426, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.26874634623527527, "step": 75 }, { "adv/mean_abs_final_conf": 0.596921443939209, "adv/mean_abs_reasoning": 0.3288942277431488, "adv/mean_abs_step_conf": 0.6491069197654724, "adv/ratio_final_to_reasoning": 1.8149343879801292, "adv/ratio_step_to_reasoning": 1.9736038671751786, "adv/std_final_conf": 0.8286421298980713, "adv/std_reasoning": 0.6186842322349548, "adv/std_step_conf": 0.8600033521652222, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 15.1484375, "calib/ece": 0.30355514403292194, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9670781893004116, "calib/gap": -0.0019850101705521928, "calib/mean_conf": 0.9783699588477366, "calib/mu_c": 0.9777409638554215, "calib/mu_w": 0.9797259740259737, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29939876543209887, "calib/std_conf": 0.05383392961866714, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5899240265906933, "calib/step_q_c_n": 2106.0, "calib/step_q_gap": -0.118399336840458, "calib/step_q_w": 0.7083233634311513, "calib/step_q_w_n": 1772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 768.8359375, "completions/mean_terminated_length": 806.6475219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.08106666666666666, "grad_norm": 1.231719732284546, "kl": 0.2412109375, "learning_rate": 3.4722222222222224e-06, "loss": -0.1814, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02048310451209545, "mask/share_reasoning": 0.7641509771347046, "mask/share_step_conf": 0.1684909164905548, "num_tokens": 23590943.0, "reward": 0.5938359498977661, "reward_std": 0.18622279167175293, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6574385762214661, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.21070200204849243, "step": 76 }, { "adv/mean_abs_final_conf": 0.47663238644599915, "adv/mean_abs_reasoning": 0.3979429006576538, "adv/mean_abs_step_conf": 0.7344629764556885, "adv/ratio_final_to_reasoning": 1.1977406448470382, "adv/ratio_step_to_reasoning": 1.8456491502722885, "adv/std_final_conf": 0.7308207750320435, "adv/std_reasoning": 0.6816669702529907, "adv/std_step_conf": 0.9212200045585632, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.75390625, "calib/ece": 0.26344840873015873, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": -0.001376341886433874, "calib/mean_conf": 0.9881309484126984, "calib/mu_c": 0.9877540928961747, "calib/mu_w": 0.9891304347826085, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2626944404761905, "calib/std_conf": 0.013123445461125982, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5077341389728096, "calib/step_q_c_n": 2317.0, "calib/step_q_gap": -0.10131901717336977, "calib/step_q_w": 0.6090531561461794, "calib/step_q_w_n": 1204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 751.79296875, "completions/mean_terminated_length": 760.70751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.08213333333333334, "grad_norm": 0.4560396671295166, "kl": 0.264556884765625, "learning_rate": 3.444444444444445e-06, "loss": -0.0218, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022371895611286163, "mask/share_reasoning": 0.7850345373153687, "mask/share_step_conf": 0.1808748096227646, "num_tokens": 23888066.0, "reward": 0.6734690070152283, "reward_std": 0.225606769323349, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7203949093818665, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2866993546485901, "step": 77 }, { "adv/mean_abs_final_conf": 0.39663469791412354, "adv/mean_abs_reasoning": 0.3622138798236847, "adv/mean_abs_step_conf": 0.5514669418334961, "adv/ratio_final_to_reasoning": 1.0950289870371448, "adv/ratio_step_to_reasoning": 1.5224898121019945, "adv/std_final_conf": 0.6700455546379089, "adv/std_reasoning": 0.6613717079162598, "adv/std_step_conf": 0.8106377720832825, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 16.52734375, "calib/ece": 0.33469477911646595, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003719503495508425, "calib/mean_conf": 0.9893132530120483, "calib/mu_c": 0.989441717791411, "calib/mu_w": 0.9890697674418601, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33469477911646595, "calib/std_conf": 0.0037830557892607965, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4919075630252101, "calib/step_q_c_n": 2380.0, "calib/step_q_gap": -0.10491037322546526, "calib/step_q_w": 0.5968179362506754, "calib/step_q_w_n": 1851.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 884.49609375, "completions/mean_terminated_length": 909.3613891601562, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.0832, "grad_norm": 0.42842844128608704, "kl": 0.218841552734375, "learning_rate": 3.416666666666667e-06, "loss": -0.1436, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01720147207379341, "mask/share_reasoning": 0.7845550775527954, "mask/share_step_conf": 0.17089968919754028, "num_tokens": 24222521.0, "reward": 0.5937488675117493, "reward_std": 0.1932610720396042, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6439374685287476, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2216852605342865, "step": 78 }, { "adv/mean_abs_final_conf": 0.44898343086242676, "adv/mean_abs_reasoning": 0.3736734390258789, "adv/mean_abs_step_conf": 0.6339391469955444, "adv/ratio_final_to_reasoning": 1.2015395903783577, "adv/ratio_step_to_reasoning": 1.6965057742614689, "adv/std_final_conf": 0.7077057957649231, "adv/std_reasoning": 0.6614980697631836, "adv/std_step_conf": 0.8597406148910522, "calib/answer_extract_rate": 0.91796875, "calib/avg_num_step_conf": 17.51171875, "calib/ece": 0.287361659574468, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000523666666666589, "calib/mean_conf": 0.9894893191489361, "calib/mu_c": 0.9893333333333331, "calib/mu_w": 0.9898569999999997, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.287361659574468, "calib/std_conf": 0.003286667646308783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4385850694444444, "calib/step_q_c_n": 2304.0, "calib/step_q_gap": -0.2518131815371476, "calib/step_q_w": 0.690398250981592, "calib/step_q_w_n": 2179.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 773.8515625, "completions/mean_terminated_length": 843.0042114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.08426666666666667, "grad_norm": 0.5278450846672058, "kl": 0.23162841796875, "learning_rate": 3.3888888888888893e-06, "loss": -0.232, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01737746223807335, "mask/share_reasoning": 0.7424191236495972, "mask/share_step_conf": 0.1581721305847168, "num_tokens": 24527003.0, "reward": 0.5930335521697998, "reward_std": 0.19911476969718933, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6499668955802917, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.22360025346279144, "step": 79 }, { "adv/mean_abs_final_conf": 0.4606902301311493, "adv/mean_abs_reasoning": 0.4467974603176117, "adv/mean_abs_step_conf": 0.6338987350463867, "adv/ratio_final_to_reasoning": 1.0310941109729266, "adv/ratio_step_to_reasoning": 1.4187608286666886, "adv/std_final_conf": 0.7015774250030518, "adv/std_reasoning": 0.7015236616134644, "adv/std_step_conf": 0.8599874377250671, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.4453125, "calib/ece": 0.25608870967741937, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001515151515150137, "calib/mean_conf": 0.9899596774193549, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9898484848484849, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25608870967741937, "calib/std_conf": 0.0010991139653579855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39251846094053633, "calib/step_q_c_n": 2573.0, "calib/step_q_gap": -0.15269515238314224, "calib/step_q_w": 0.5452136133236786, "calib/step_q_w_n": 1381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 751.12890625, "completions/mean_terminated_length": 775.3588256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.08533333333333333, "grad_norm": 0.39397016167640686, "kl": 0.25689697265625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0626, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019908176735043526, "mask/share_reasoning": 0.7659863233566284, "mask/share_step_conf": 0.1828555464744568, "num_tokens": 24821452.0, "reward": 0.6637634634971619, "reward_std": 0.2425382137298584, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7160730361938477, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2755163013935089, "step": 80 }, { "adv/mean_abs_final_conf": 0.4842243194580078, "adv/mean_abs_reasoning": 0.43560612201690674, "adv/mean_abs_step_conf": 0.6790660619735718, "adv/ratio_final_to_reasoning": 1.111610454912785, "adv/ratio_step_to_reasoning": 1.5588992616297892, "adv/std_final_conf": 0.728503406047821, "adv/std_reasoning": 0.7210630178451538, "adv/std_step_conf": 0.8755649924278259, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 18.1484375, "calib/ece": 0.24861637931034497, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.813953488642554e-06, "calib/mean_conf": 0.9899956896551726, "calib/mu_c": 0.9899941860465116, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24861637931034497, "calib/std_conf": 0.0015824954236126742, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3576974088713219, "calib/step_q_c_n": 2277.0, "calib/step_q_gap": -0.26280913397376043, "calib/step_q_w": 0.6205065428450823, "calib/step_q_w_n": 2369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 749.546875, "completions/mean_terminated_length": 823.5364990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 0.0864, "grad_norm": 0.5047710537910461, "kl": 0.24090576171875, "learning_rate": 3.3333333333333333e-06, "loss": -0.308, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.017829496413469315, "mask/share_reasoning": 0.7260985374450684, "mask/share_step_conf": 0.16622823476791382, "num_tokens": 25119584.0, "reward": 0.6293458938598633, "reward_std": 0.24940373003482819, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6764695048332214, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.2665971517562866, "step": 81 }, { "adv/mean_abs_final_conf": 0.4893885850906372, "adv/mean_abs_reasoning": 0.4772290289402008, "adv/mean_abs_step_conf": 0.669119119644165, "adv/ratio_final_to_reasoning": 1.025479498129943, "adv/ratio_step_to_reasoning": 1.4020922430684932, "adv/std_final_conf": 0.7392094731330872, "adv/std_reasoning": 0.7395800352096558, "adv/std_step_conf": 0.8755518198013306, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 17.9296875, "calib/ece": 0.2990042918454936, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003895790200137972, "calib/mean_conf": 0.9899914163090129, "calib/mu_c": 0.9901118012422361, "calib/mu_w": 0.9897222222222223, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2990042918454936, "calib/std_conf": 0.001246424642294338, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35714638694638695, "calib/step_q_c_n": 2145.0, "calib/step_q_gap": -0.2990908318675191, "calib/step_q_w": 0.6562372188139061, "calib/step_q_w_n": 2445.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 714.0234375, "completions/mean_terminated_length": 781.1538696289062, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.08746666666666666, "grad_norm": 0.478674054145813, "kl": 0.247528076171875, "learning_rate": 3.3055555555555558e-06, "loss": -0.2793, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.018443914130330086, "mask/share_reasoning": 0.7258409857749939, "mask/share_step_conf": 0.16977760195732117, "num_tokens": 25407926.0, "reward": 0.5754408836364746, "reward_std": 0.24024444818496704, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6345949172973633, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.2084743082523346, "step": 82 }, { "adv/mean_abs_final_conf": 0.43164581060409546, "adv/mean_abs_reasoning": 0.43753427267074585, "adv/mean_abs_step_conf": 0.6022412776947021, "adv/ratio_final_to_reasoning": 0.9865417124224196, "adv/ratio_step_to_reasoning": 1.3764436646724174, "adv/std_final_conf": 0.7003706693649292, "adv/std_reasoning": 0.7015554308891296, "adv/std_step_conf": 0.8437060713768005, "calib/answer_extract_rate": 0.890625, "calib/avg_num_step_conf": 19.76171875, "calib/ece": 0.3889868421052633, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9956140350877193, "calib/gap": 0.011206521739130726, "calib/mean_conf": 0.9854780701754386, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9787934782608694, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3889868421052633, "calib/std_conf": 0.06553487388041113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40866153846153846, "calib/step_q_c_n": 1950.0, "calib/step_q_gap": -0.22232913378033997, "calib/step_q_w": 0.6309906722418784, "calib/step_q_w_n": 3109.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 850.54296875, "completions/mean_terminated_length": 950.8253173828125, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 0.08853333333333334, "grad_norm": 0.3890380263328552, "kl": 0.208526611328125, "learning_rate": 3.277777777777778e-06, "loss": -0.1941, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.015617813915014267, "mask/share_reasoning": 0.717111349105835, "mask/share_step_conf": 0.16180211305618286, "num_tokens": 25732929.0, "reward": 0.4802950620651245, "reward_std": 0.20812444388866425, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5424792766571045, "rewards/format_reward_step": 0.890625, "rewards/step_margin_reward": 0.13373583555221558, "step": 83 }, { "adv/mean_abs_final_conf": 0.44959962368011475, "adv/mean_abs_reasoning": 0.43477439880371094, "adv/mean_abs_step_conf": 0.6621416807174683, "adv/ratio_final_to_reasoning": 1.0340986610922716, "adv/ratio_step_to_reasoning": 1.5229546232238196, "adv/std_final_conf": 0.7216721177101135, "adv/std_reasoning": 0.7207791805267334, "adv/std_step_conf": 0.8756316900253296, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 15.00390625, "calib/ece": 0.3046721991701246, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.002676794258373416, "calib/mean_conf": 0.989319502074689, "calib/mu_c": 0.9901636363636362, "calib/mu_w": 0.9874868421052628, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3046721991701246, "calib/std_conf": 0.012359148062504485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42497302413629906, "calib/step_q_c_n": 2113.0, "calib/step_q_gap": -0.12437246197481205, "calib/step_q_w": 0.5493454861111111, "calib/step_q_w_n": 1728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 759.96484375, "completions/mean_terminated_length": 797.3401489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.0896, "grad_norm": 0.716595470905304, "kl": 0.25469970703125, "learning_rate": 3.2500000000000002e-06, "loss": -0.1958, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02004670724272728, "mask/share_reasoning": 0.757233202457428, "mask/share_step_conf": 0.17584508657455444, "num_tokens": 26033400.0, "reward": 0.6169133186340332, "reward_std": 0.24483832716941833, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6517096757888794, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.2649293541908264, "step": 84 }, { "adv/mean_abs_final_conf": 0.5315093994140625, "adv/mean_abs_reasoning": 0.5010403394699097, "adv/mean_abs_step_conf": 0.5558913946151733, "adv/ratio_final_to_reasoning": 1.0608115904926707, "adv/ratio_step_to_reasoning": 1.1094743293589793, "adv/std_final_conf": 0.7596035003662109, "adv/std_reasoning": 0.757909893989563, "adv/std_step_conf": 0.7936156988143921, "calib/answer_extract_rate": 0.890625, "calib/avg_num_step_conf": 17.48828125, "calib/ece": 0.2811409691629957, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005741577263316255, "calib/mean_conf": 0.9903920704845816, "calib/mu_c": 0.9905590062111801, "calib/mu_w": 0.9899848484848485, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2811409691629957, "calib/std_conf": 0.0020522961534924387, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45196431838975293, "calib/step_q_c_n": 2186.0, "calib/step_q_gap": -0.1981753586071917, "calib/step_q_w": 0.6501396769969446, "calib/step_q_w_n": 2291.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 742.7890625, "completions/mean_terminated_length": 823.177490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.09066666666666667, "grad_norm": 0.3252770006656647, "kl": 0.23199462890625, "learning_rate": 3.2222222222222227e-06, "loss": -0.2967, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.01761934906244278, "mask/share_reasoning": 0.7188683748245239, "mask/share_step_conf": 0.1658560037612915, "num_tokens": 26331378.0, "reward": 0.570929765701294, "reward_std": 0.2469976842403412, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6339846849441528, "rewards/format_reward_step": 0.88671875, "rewards/step_margin_reward": 0.20396846532821655, "step": 85 }, { "adv/mean_abs_final_conf": 0.5568798184394836, "adv/mean_abs_reasoning": 0.49567070603370667, "adv/mean_abs_step_conf": 0.6302189230918884, "adv/ratio_final_to_reasoning": 1.1234874517712867, "adv/ratio_step_to_reasoning": 1.2714467799293998, "adv/std_final_conf": 0.8091704845428467, "adv/std_reasoning": 0.7755091786384583, "adv/std_step_conf": 0.8597295880317688, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 16.34375, "calib/ece": 0.39535425383542533, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": -0.0038166520979023844, "calib/mean_conf": 0.9881018131101812, "calib/mu_c": 0.9865687645687643, "calib/mu_w": 0.9903854166666667, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3925648535564853, "calib/std_conf": 0.04252623935218979, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4565, "calib/step_q_c_n": 1922.0, "calib/step_q_gap": -0.16944783377542005, "calib/step_q_w": 0.6259478337754201, "calib/step_q_w_n": 2262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 823.42578125, "completions/mean_terminated_length": 878.3208618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.09173333333333333, "grad_norm": 0.5175958871841431, "kl": 0.21533203125, "learning_rate": 3.1944444444444443e-06, "loss": -0.2155, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01790308579802513, "mask/share_reasoning": 0.756227433681488, "mask/share_step_conf": 0.16336949169635773, "num_tokens": 26647687.0, "reward": 0.5359945297241211, "reward_std": 0.23279647529125214, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5600782632827759, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.21425451338291168, "step": 86 }, { "adv/mean_abs_final_conf": 0.5304808616638184, "adv/mean_abs_reasoning": 0.4583980441093445, "adv/mean_abs_step_conf": 0.6418082118034363, "adv/ratio_final_to_reasoning": 1.1572494003427283, "adv/ratio_step_to_reasoning": 1.4001111480535502, "adv/std_final_conf": 0.7625007629394531, "adv/std_reasoning": 0.7397937178611755, "adv/std_step_conf": 0.8598397374153137, "calib/answer_extract_rate": 0.86328125, "calib/avg_num_step_conf": 19.32421875, "calib/ece": 0.16364705882352948, "calib/final_conf_rate": 0.86328125, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00048777681909739634, "calib/mean_conf": 0.9917013574660634, "calib/mu_c": 0.9916174863387976, "calib/mu_w": 0.992105263157895, "calib/nonempty_final_conf_rate": 0.86328125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16364705882352948, "calib/std_conf": 0.0035937915248107737, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4828537754694367, "calib/step_q_c_n": 2503.0, "calib/step_q_gap": -0.2960459790313817, "calib/step_q_w": 0.7788997545008184, "calib/step_q_w_n": 2444.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 702.1953125, "completions/mean_terminated_length": 806.107666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.0928, "grad_norm": 0.983510434627533, "kl": 0.241546630859375, "learning_rate": 3.1666666666666667e-06, "loss": -0.4124, "mask/has_final_conf_rate": 0.86328125, "mask/share_final_conf": 0.017843790352344513, "mask/share_reasoning": 0.6948947906494141, "mask/share_step_conf": 0.1583552062511444, "num_tokens": 26932945.0, "reward": 0.6232060194015503, "reward_std": 0.24949079751968384, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7171169519424438, "rewards/format_reward_step": 0.86328125, "rewards/step_margin_reward": 0.21367013454437256, "step": 87 }, { "adv/mean_abs_final_conf": 0.5327442288398743, "adv/mean_abs_reasoning": 0.4311426281929016, "adv/mean_abs_step_conf": 0.6246463060379028, "adv/ratio_final_to_reasoning": 1.2356565878740111, "adv/ratio_step_to_reasoning": 1.4488159258481486, "adv/std_final_conf": 0.7479230165481567, "adv/std_reasoning": 0.7206476926803589, "adv/std_step_conf": 0.8437591791152954, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.23046875, "calib/ece": 0.24886097154471543, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013218293000238912, "calib/mean_conf": 0.9927634105691057, "calib/mu_c": 0.9927972622950819, "calib/mu_w": 0.9926650793650795, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24886097154471543, "calib/std_conf": 0.004290628973701832, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4629473269519519, "calib/step_q_c_n": 2664.0, "calib/step_q_gap": -0.0537554260844853, "calib/step_q_w": 0.5167027530364372, "calib/step_q_w_n": 1235.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 803.13671875, "completions/mean_terminated_length": 835.7845458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.09386666666666667, "grad_norm": 0.5124315619468689, "kl": 0.22320556640625, "learning_rate": 3.138888888888889e-06, "loss": -0.0945, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018675651401281357, "mask/share_reasoning": 0.7699906826019287, "mask/share_step_conf": 0.17227113246917725, "num_tokens": 27248396.0, "reward": 0.6558331251144409, "reward_std": 0.2364097386598587, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7183858752250671, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2581240236759186, "step": 88 }, { "adv/mean_abs_final_conf": 0.6126490235328674, "adv/mean_abs_reasoning": 0.4717230200767517, "adv/mean_abs_step_conf": 0.5741785764694214, "adv/ratio_final_to_reasoning": 1.2987473527011388, "adv/ratio_step_to_reasoning": 1.217194311136225, "adv/std_final_conf": 0.7811753153800964, "adv/std_reasoning": 0.7207298874855042, "adv/std_step_conf": 0.8108007311820984, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 15.31640625, "calib/ece": 0.39068423236514527, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002062392241379274, "calib/mean_conf": 0.9923439834024896, "calib/mu_c": 0.9931655172413794, "calib/mu_w": 0.9911031250000001, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39068423236514527, "calib/std_conf": 0.00506345370134029, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4329008875739645, "calib/step_q_c_n": 2028.0, "calib/step_q_gap": -0.09478849435947445, "calib/step_q_w": 0.527689381933439, "calib/step_q_w_n": 1893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 875.5390625, "completions/mean_terminated_length": 914.8489379882812, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.09493333333333333, "grad_norm": 0.6323784589767456, "kl": 0.19537353515625, "learning_rate": 3.1111111111111116e-06, "loss": -0.1561, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01836657151579857, "mask/share_reasoning": 0.7712298631668091, "mask/share_step_conf": 0.16743478178977966, "num_tokens": 27581422.0, "reward": 0.5453107357025146, "reward_std": 0.2122250199317932, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5729995965957642, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.21605932712554932, "step": 89 }, { "adv/mean_abs_final_conf": 0.545224666595459, "adv/mean_abs_reasoning": 0.4193390905857086, "adv/mean_abs_step_conf": 0.6760151982307434, "adv/ratio_final_to_reasoning": 1.3001999547285723, "adv/ratio_step_to_reasoning": 1.6120967813578373, "adv/std_final_conf": 0.745468020439148, "adv/std_reasoning": 0.7014575600624084, "adv/std_step_conf": 0.8911756277084351, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 16.046875, "calib/ece": 0.3187395573997232, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.991701244813278, "calib/gap": -0.006903259951430596, "calib/mean_conf": 0.9881724757952973, "calib/mu_c": 0.985966869918699, "calib/mu_w": 0.9928701298701296, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31320705394190856, "calib/std_conf": 0.060135680947406664, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4130003935286401, "calib/step_q_c_n": 2287.0, "calib/step_q_gap": -0.12124067670139232, "calib/step_q_w": 0.5342410702300324, "calib/step_q_w_n": 1821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 819.77734375, "completions/mean_terminated_length": 863.6337280273438, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.096, "grad_norm": 0.7133516669273376, "kl": 0.212066650390625, "learning_rate": 3.0833333333333336e-06, "loss": -0.1441, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018856745213270187, "mask/share_reasoning": 0.7539302110671997, "mask/share_step_conf": 0.1764318197965622, "num_tokens": 27894605.0, "reward": 0.6118367910385132, "reward_std": 0.22222277522087097, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6413779258728027, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.26588934659957886, "step": 90 }, { "adv/mean_abs_final_conf": 0.5682475566864014, "adv/mean_abs_reasoning": 0.44651511311531067, "adv/mean_abs_step_conf": 0.6393174529075623, "adv/ratio_final_to_reasoning": 1.272627823774587, "adv/ratio_step_to_reasoning": 1.431793536498867, "adv/std_final_conf": 0.7608886361122131, "adv/std_reasoning": 0.7014663219451904, "adv/std_step_conf": 0.8439959287643433, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.921875, "calib/ece": 0.3113582995951417, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": -0.010849006875477074, "calib/mean_conf": 0.9835850202429149, "calib/mu_c": 0.9802029411764706, "calib/mu_w": 0.9910519480519476, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3033421052631579, "calib/std_conf": 0.0881249851884996, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3538924684370918, "calib/step_q_c_n": 2297.0, "calib/step_q_gap": -0.08279893011839073, "calib/step_q_w": 0.43669139855548256, "calib/step_q_w_n": 1523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 815.0703125, "completions/mean_terminated_length": 844.7692260742188, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 0.09706666666666666, "grad_norm": 0.8096587061882019, "kl": 0.208343505859375, "learning_rate": 3.055555555555556e-06, "loss": -0.1027, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018055833876132965, "mask/share_reasoning": 0.7757927179336548, "mask/share_step_conf": 0.17099520564079285, "num_tokens": 28210975.0, "reward": 0.618476390838623, "reward_std": 0.23961031436920166, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6538805961608887, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2588534653186798, "step": 91 }, { "adv/mean_abs_final_conf": 0.47137245535850525, "adv/mean_abs_reasoning": 0.3835318684577942, "adv/mean_abs_step_conf": 0.6305508017539978, "adv/ratio_final_to_reasoning": 1.2290307380555456, "adv/ratio_step_to_reasoning": 1.64406364532231, "adv/std_final_conf": 0.7121309041976929, "adv/std_reasoning": 0.6816805005073547, "adv/std_step_conf": 0.8598589301109314, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.89453125, "calib/ece": 0.21645403225806456, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9798387096774194, "calib/gap": 0.03951386569872939, "calib/mean_conf": 0.9825830645161291, "calib/mu_c": 0.9918242105263158, "calib/mu_w": 0.9523103448275864, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21645403225806456, "calib/std_conf": 0.07772657796961607, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3606782115065697, "calib/step_q_c_n": 2613.0, "calib/step_q_gap": -0.00377512182676365, "calib/step_q_w": 0.36445333333333335, "calib/step_q_w_n": 1200.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 802.26171875, "completions/mean_terminated_length": 824.8152465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.09813333333333334, "grad_norm": 0.5272431969642639, "kl": 0.210418701171875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0923, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02028692699968815, "mask/share_reasoning": 0.7759718894958496, "mask/share_step_conf": 0.17639735341072083, "num_tokens": 28523074.0, "reward": 0.6903301477432251, "reward_std": 0.19578728079795837, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7576503753662109, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2808223366737366, "step": 92 }, { "adv/mean_abs_final_conf": 0.6249883770942688, "adv/mean_abs_reasoning": 0.4877223074436188, "adv/mean_abs_step_conf": 0.7049061059951782, "adv/ratio_final_to_reasoning": 1.2814430825814096, "adv/ratio_step_to_reasoning": 1.4453021632123442, "adv/std_final_conf": 0.7965861558914185, "adv/std_reasoning": 0.739631712436676, "adv/std_step_conf": 0.8913946151733398, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 15.8984375, "calib/ece": 0.27326973500697344, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9707112970711297, "calib/gap": 0.04162226972948824, "calib/mean_conf": 0.9731302649930265, "calib/mu_c": 0.9854950396825398, "calib/mu_w": 0.9438727699530516, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27173556485355643, "calib/std_conf": 0.11830748467313208, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3847459764985869, "calib/step_q_c_n": 2241.0, "calib/step_q_gap": -0.03284241424316636, "calib/step_q_w": 0.41758839074175325, "calib/step_q_w_n": 1829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 785.74609375, "completions/mean_terminated_length": 834.6514892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.0992, "grad_norm": 0.7500261664390564, "kl": 0.21484375, "learning_rate": 3e-06, "loss": -0.1888, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01931125298142433, "mask/share_reasoning": 0.7512341141700745, "mask/share_step_conf": 0.17086084187030792, "num_tokens": 28830001.0, "reward": 0.6308143138885498, "reward_std": 0.28138795495033264, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6736418008804321, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.27001798152923584, "step": 93 }, { "adv/mean_abs_final_conf": 0.5569735765457153, "adv/mean_abs_reasoning": 0.3944690227508545, "adv/mean_abs_step_conf": 0.5707467794418335, "adv/ratio_final_to_reasoning": 1.4119577062392001, "adv/ratio_step_to_reasoning": 1.4468735097668632, "adv/std_final_conf": 0.773460865020752, "adv/std_reasoning": 0.681592583656311, "adv/std_step_conf": 0.8110101819038391, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 14.4765625, "calib/ece": 0.32540532503457825, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.983402489626556, "calib/gap": 0.009855063014403309, "calib/mean_conf": 0.9865394882434302, "calib/mu_c": 0.9898517708333333, "calib/mu_w": 0.97999670781893, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3240221991701246, "calib/std_conf": 0.061612208320076564, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.32253509980568806, "calib/step_q_c_n": 1887.0, "calib/step_q_gap": -0.14021302187288992, "calib/step_q_w": 0.462748121678578, "calib/step_q_w_n": 1819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 707.46484375, "completions/mean_terminated_length": 748.3925170898438, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.10026666666666667, "grad_norm": 0.6596551537513733, "kl": 0.230255126953125, "learning_rate": 2.9722222222222225e-06, "loss": -0.131, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.021078769117593765, "mask/share_reasoning": 0.7578392624855042, "mask/share_step_conf": 0.16639444231987, "num_tokens": 29119792.0, "reward": 0.5926636457443237, "reward_std": 0.20104211568832397, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6339141130447388, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.23891310393810272, "step": 94 }, { "adv/mean_abs_final_conf": 0.4981163740158081, "adv/mean_abs_reasoning": 0.33328384160995483, "adv/mean_abs_step_conf": 0.6746397018432617, "adv/ratio_final_to_reasoning": 1.4945710287351954, "adv/ratio_step_to_reasoning": 2.0242196518869906, "adv/std_final_conf": 0.6904189586639404, "adv/std_reasoning": 0.6186611652374268, "adv/std_step_conf": 0.8758255839347839, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.515625, "calib/ece": 0.2768244979919678, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.021064312617702696, "calib/mean_conf": 0.9876678714859438, "calib/mu_c": 0.993758757062147, "calib/mu_w": 0.9726944444444443, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2768244979919678, "calib/std_conf": 0.07046846977775822, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31826907868722815, "calib/step_q_c_n": 2529.0, "calib/step_q_gap": 0.05056060097875048, "calib/step_q_w": 0.26770847770847767, "calib/step_q_w_n": 1443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 846.015625, "completions/mean_terminated_length": 869.7991943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.10133333333333333, "grad_norm": 0.6777894496917725, "kl": 0.204498291015625, "learning_rate": 2.944444444444445e-06, "loss": -0.116, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01932145655155182, "mask/share_reasoning": 0.7791246771812439, "mask/share_step_conf": 0.1742100864648819, "num_tokens": 29442500.0, "reward": 0.6396147608757019, "reward_std": 0.21907824277877808, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7017877101898193, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.24462932348251343, "step": 95 }, { "adv/mean_abs_final_conf": 0.4814407229423523, "adv/mean_abs_reasoning": 0.34680551290512085, "adv/mean_abs_step_conf": 0.6026292443275452, "adv/ratio_final_to_reasoning": 1.388215310966135, "adv/ratio_step_to_reasoning": 1.7376576262569754, "adv/std_final_conf": 0.7257750034332275, "adv/std_reasoning": 0.6612364649772644, "adv/std_step_conf": 0.8277370929718018, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.5546875, "calib/ece": 0.21818557823129242, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9551020408163265, "calib/gap": 0.04885828924162239, "calib/mean_conf": 0.9678454421768707, "calib/mu_c": 0.9790130511463844, "calib/mu_w": 0.930154761904762, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20730122448979582, "calib/std_conf": 0.14024968713784997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37112126843657817, "calib/step_q_c_n": 2260.0, "calib/step_q_gap": -0.0931903050058957, "calib/step_q_w": 0.46431157344247387, "calib/step_q_w_n": 1466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 712.390625, "completions/mean_terminated_length": 741.3495483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.1024, "grad_norm": 1.1299368143081665, "kl": 0.2353515625, "learning_rate": 2.916666666666667e-06, "loss": -0.0482, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02144728973507881, "mask/share_reasoning": 0.761890172958374, "mask/share_step_conf": 0.17759999632835388, "num_tokens": 29730688.0, "reward": 0.6556459665298462, "reward_std": 0.2049870640039444, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7490242719650269, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.223205104470253, "step": 96 }, { "adv/mean_abs_final_conf": 0.6165008544921875, "adv/mean_abs_reasoning": 0.46222391724586487, "adv/mean_abs_step_conf": 0.6652477383613586, "adv/ratio_final_to_reasoning": 1.3337709960262831, "adv/ratio_step_to_reasoning": 1.4392326176568269, "adv/std_final_conf": 0.7992217540740967, "adv/std_reasoning": 0.7207860946655273, "adv/std_step_conf": 0.8758837580680847, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 14.47265625, "calib/ece": 0.310992638888889, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9541666666666667, "calib/gap": 0.04828318001939369, "calib/mean_conf": 0.9623815277777779, "calib/mu_c": 0.9782747412008281, "calib/mu_w": 0.9299915611814344, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30127041666666676, "calib/std_conf": 0.15611747531455863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4241068054850178, "calib/step_q_c_n": 1969.0, "calib/step_q_gap": 0.025339830638627603, "calib/step_q_w": 0.3987669748463902, "calib/step_q_w_n": 1736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 701.59765625, "completions/mean_terminated_length": 745.265625, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.10346666666666667, "grad_norm": 1.2831897735595703, "kl": 0.230224609375, "learning_rate": 2.888888888888889e-06, "loss": -0.1498, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02083716168999672, "mask/share_reasoning": 0.7508844137191772, "mask/share_step_conf": 0.16968463361263275, "num_tokens": 30015369.0, "reward": 0.647125244140625, "reward_std": 0.24315395951271057, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6479384899139404, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.3330307602882385, "step": 97 }, { "adv/mean_abs_final_conf": 0.5435746908187866, "adv/mean_abs_reasoning": 0.4460451602935791, "adv/mean_abs_step_conf": 0.6659544706344604, "adv/ratio_final_to_reasoning": 1.2186539373300571, "adv/ratio_step_to_reasoning": 1.493020281166465, "adv/std_final_conf": 0.7484281063079834, "adv/std_reasoning": 0.7395012974739075, "adv/std_step_conf": 0.875845730304718, "calib/answer_extract_rate": 0.94921875, "calib/avg_num_step_conf": 14.78125, "calib/ece": 0.325366803840878, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9753086419753086, "calib/gap": 0.03632447791164661, "calib/mean_conf": 0.9715478737997257, "calib/mu_c": 0.9839549999999999, "calib/mu_w": 0.9476305220883533, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3192392318244171, "calib/std_conf": 0.1361281358591226, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4082570292620865, "calib/step_q_c_n": 2096.0, "calib/step_q_gap": -0.06560868953728155, "calib/step_q_w": 0.47386571879936806, "calib/step_q_w_n": 1688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 749.0703125, "completions/mean_terminated_length": 782.7020263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.10453333333333334, "grad_norm": 0.5426382422447205, "kl": 0.229400634765625, "learning_rate": 2.861111111111111e-06, "loss": -0.1304, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.020803090184926987, "mask/share_reasoning": 0.7624001502990723, "mask/share_step_conf": 0.17382797598838806, "num_tokens": 30313315.0, "reward": 0.5548592805862427, "reward_std": 0.24476754665374756, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6406000852584839, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.15427470207214355, "step": 98 }, { "adv/mean_abs_final_conf": 0.6283714771270752, "adv/mean_abs_reasoning": 0.49294739961624146, "adv/mean_abs_step_conf": 0.5275165438652039, "adv/ratio_final_to_reasoning": 1.2747231806400867, "adv/ratio_step_to_reasoning": 1.0701274502632012, "adv/std_final_conf": 0.8293971419334412, "adv/std_reasoning": 0.7396363019943237, "adv/std_step_conf": 0.7765645384788513, "calib/answer_extract_rate": 0.89453125, "calib/avg_num_step_conf": 17.609375, "calib/ece": 0.427187768558952, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9082969432314411, "calib/gap": 0.002509178710178861, "calib/mean_conf": 0.9253537074235808, "calib/mu_c": 0.9264384615384617, "calib/mu_w": 0.9239292828282828, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3924279432314411, "calib/std_conf": 0.23150664868316992, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4221572494669509, "calib/step_q_c_n": 1876.0, "calib/step_q_gap": -0.018625883131833276, "calib/step_q_w": 0.44078313259878416, "calib/step_q_w_n": 2632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 838.328125, "completions/mean_terminated_length": 929.0562744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 249.0, "epoch": 0.1056, "grad_norm": 0.6395625472068787, "kl": 0.195220947265625, "learning_rate": 2.8333333333333335e-06, "loss": -0.2201, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.017325591295957565, "mask/share_reasoning": 0.7251094579696655, "mask/share_step_conf": 0.15990865230560303, "num_tokens": 30633727.0, "reward": 0.4642782509326935, "reward_std": 0.25697386264801025, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5137213468551636, "rewards/format_reward_step": 0.890625, "rewards/step_margin_reward": 0.13514766097068787, "step": 99 }, { "adv/mean_abs_final_conf": 0.6297253966331482, "adv/mean_abs_reasoning": 0.46328994631767273, "adv/mean_abs_step_conf": 0.6763242483139038, "adv/ratio_final_to_reasoning": 1.3592468423680244, "adv/ratio_step_to_reasoning": 1.4598293221976284, "adv/std_final_conf": 0.8469195365905762, "adv/std_reasoning": 0.7577730417251587, "adv/std_step_conf": 0.9062535762786865, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 16.05078125, "calib/ece": 0.249825, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.8983050847457628, "calib/gap": 0.1654154810298103, "calib/mean_conf": 0.9206927966101696, "calib/mu_c": 0.9711585365853659, "calib/mu_w": 0.8057430555555556, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23780127118644068, "calib/std_conf": 0.23762598061704632, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42712190565156466, "calib/step_q_c_n": 2141.0, "calib/step_q_gap": -0.005995872549654835, "calib/step_q_w": 0.4331177782012195, "calib/step_q_w_n": 1968.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 780.60546875, "completions/mean_terminated_length": 843.1856079101562, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.10666666666666667, "grad_norm": 0.790928304195404, "kl": 0.2076416015625, "learning_rate": 2.805555555555556e-06, "loss": -0.2755, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.018736468628048897, "mask/share_reasoning": 0.7489428520202637, "mask/share_step_conf": 0.1581019014120102, "num_tokens": 30940970.0, "reward": 0.617317795753479, "reward_std": 0.25454002618789673, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6920415759086609, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.23009398579597473, "step": 100 }, { "adv/mean_abs_final_conf": 0.6387467384338379, "adv/mean_abs_reasoning": 0.5353177785873413, "adv/mean_abs_step_conf": 0.6979807615280151, "adv/ratio_final_to_reasoning": 1.1932103957380922, "adv/ratio_step_to_reasoning": 1.3038624709418913, "adv/std_final_conf": 0.8217288255691528, "adv/std_reasoning": 0.775704562664032, "adv/std_step_conf": 0.8914257884025574, "calib/answer_extract_rate": 0.91796875, "calib/avg_num_step_conf": 17.5625, "calib/ece": 0.3170382978723405, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.8936170212765957, "calib/gap": 0.1124597315436241, "calib/mean_conf": 0.9138042553191488, "calib/mu_c": 0.9549597315436241, "calib/mu_w": 0.8425, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29840000000000005, "calib/std_conf": 0.24286319584652155, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39851011029411765, "calib/step_q_c_n": 2176.0, "calib/step_q_gap": -0.026448510395537528, "calib/step_q_w": 0.4249586206896552, "calib/step_q_w_n": 2320.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 806.91015625, "completions/mean_terminated_length": 871.59912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.10773333333333333, "grad_norm": 0.9349942803382874, "kl": 0.204559326171875, "learning_rate": 2.7777777777777783e-06, "loss": -0.1962, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.017822718247771263, "mask/share_reasoning": 0.7434788942337036, "mask/share_step_conf": 0.16447967290878296, "num_tokens": 31254531.0, "reward": 0.5653898119926453, "reward_std": 0.2803936004638672, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6268872022628784, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.20389243960380554, "step": 101 }, { "adv/mean_abs_final_conf": 0.6121485829353333, "adv/mean_abs_reasoning": 0.4123254418373108, "adv/mean_abs_step_conf": 0.5459374189376831, "adv/ratio_final_to_reasoning": 1.484624815310004, "adv/ratio_step_to_reasoning": 1.3240449497974247, "adv/std_final_conf": 0.8249315023422241, "adv/std_reasoning": 0.6818413138389587, "adv/std_step_conf": 0.7938175797462463, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 14.37890625, "calib/ece": 0.29191631799163176, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9121338912133892, "calib/gap": 0.05567853134519829, "calib/mean_conf": 0.9319999999999999, "calib/mu_c": 0.9499382716049384, "calib/mu_w": 0.8942597402597401, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27304602510460246, "calib/std_conf": 0.20579792664378005, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42031159793814427, "calib/step_q_c_n": 1940.0, "calib/step_q_gap": 0.06465438943728269, "calib/step_q_w": 0.3556572085008616, "calib/step_q_w_n": 1741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 679.98046875, "completions/mean_terminated_length": 722.3029174804688, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.1088, "grad_norm": 0.8080055713653564, "kl": 0.25323486328125, "learning_rate": 2.7500000000000004e-06, "loss": -0.1598, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.022250540554523468, "mask/share_reasoning": 0.7478595972061157, "mask/share_step_conf": 0.1712961196899414, "num_tokens": 31535302.0, "reward": 0.5693697929382324, "reward_std": 0.23744550347328186, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6525646448135376, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.17211249470710754, "step": 102 }, { "adv/mean_abs_final_conf": 0.5998444557189941, "adv/mean_abs_reasoning": 0.40662068128585815, "adv/mean_abs_step_conf": 0.6629431247711182, "adv/ratio_final_to_reasoning": 1.4751941633221992, "adv/ratio_step_to_reasoning": 1.6303723722922567, "adv/std_final_conf": 0.8152146339416504, "adv/std_reasoning": 0.7015076279640198, "adv/std_step_conf": 0.8602816462516785, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 16.20703125, "calib/ece": 0.2672309322033899, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.885593220338983, "calib/gap": 0.1465941734417343, "calib/mean_conf": 0.8972733050847458, "calib/mu_c": 0.9419969512195122, "calib/mu_w": 0.7954027777777779, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23479449152542375, "calib/std_conf": 0.27585804982234663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4018286864490603, "calib/step_q_c_n": 2022.0, "calib/step_q_gap": 0.074487830783804, "calib/step_q_w": 0.3273408556652563, "calib/step_q_w_n": 2127.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 852.88671875, "completions/mean_terminated_length": 909.7459106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 281.0, "epoch": 0.10986666666666667, "grad_norm": 1.186505675315857, "kl": 0.198944091796875, "learning_rate": 2.7222222222222224e-06, "loss": -0.1797, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.018817514181137085, "mask/share_reasoning": 0.7602624893188477, "mask/share_step_conf": 0.15841998159885406, "num_tokens": 31858193.0, "reward": 0.5926206111907959, "reward_std": 0.25742650032043457, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.675830066204071, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.19691118597984314, "step": 103 }, { "adv/mean_abs_final_conf": 0.6392145156860352, "adv/mean_abs_reasoning": 0.47575998306274414, "adv/mean_abs_step_conf": 0.6251335144042969, "adv/ratio_final_to_reasoning": 1.3435651135916034, "adv/ratio_step_to_reasoning": 1.313968254286433, "adv/std_final_conf": 0.8539537787437439, "adv/std_reasoning": 0.7577002048492432, "adv/std_step_conf": 0.8432413339614868, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 15.88671875, "calib/ece": 0.34682083333333336, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.825, "calib/gap": 0.14368683529749926, "calib/mean_conf": 0.8533208333333333, "calib/mu_c": 0.9119929577464788, "calib/mu_w": 0.7683061224489796, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30423750000000005, "calib/std_conf": 0.3230028785722694, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47077692307692304, "calib/step_q_c_n": 1820.0, "calib/step_q_gap": 0.0288770565882715, "calib/step_q_w": 0.44189986648865154, "calib/step_q_w_n": 2247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 754.546875, "completions/mean_terminated_length": 804.8500366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.11093333333333333, "grad_norm": 1.338552713394165, "kl": 0.223480224609375, "learning_rate": 2.6944444444444444e-06, "loss": -0.2012, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.0200947392731905, "mask/share_reasoning": 0.7502679228782654, "mask/share_step_conf": 0.16713735461235046, "num_tokens": 32158037.0, "reward": 0.574633777141571, "reward_std": 0.2656676769256592, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6113731861114502, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.2402380108833313, "step": 104 }, { "adv/mean_abs_final_conf": 0.5889736413955688, "adv/mean_abs_reasoning": 0.480373740196228, "adv/mean_abs_step_conf": 0.5982500314712524, "adv/ratio_final_to_reasoning": 1.2260737673857418, "adv/ratio_step_to_reasoning": 1.2453845441819387, "adv/std_final_conf": 0.7930524349212646, "adv/std_reasoning": 0.7577711939811707, "adv/std_step_conf": 0.8277463316917419, "calib/answer_extract_rate": 0.91796875, "calib/avg_num_step_conf": 16.32421875, "calib/ece": 0.21246808510638282, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.9106382978723404, "calib/gap": 0.13089104291934484, "calib/mean_conf": 0.9232765957446807, "calib/mu_c": 0.9527967032967033, "calib/mu_w": 0.8219056603773585, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18063829787234026, "calib/std_conf": 0.24153592765023144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.410578872614292, "calib/step_q_c_n": 2253.0, "calib/step_q_gap": -0.012402866741886565, "calib/step_q_w": 0.42298173935617855, "calib/step_q_w_n": 1926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 747.4609375, "completions/mean_terminated_length": 810.8051147460938, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.112, "grad_norm": 1.060556411743164, "kl": 0.228790283203125, "learning_rate": 2.666666666666667e-06, "loss": -0.2401, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.019843213260173798, "mask/share_reasoning": 0.7385556697845459, "mask/share_step_conf": 0.16347616910934448, "num_tokens": 32455147.0, "reward": 0.6941068172454834, "reward_std": 0.25616464018821716, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7257221341133118, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.33671021461486816, "step": 105 }, { "adv/mean_abs_final_conf": 0.5451048612594604, "adv/mean_abs_reasoning": 0.41657865047454834, "adv/mean_abs_step_conf": 0.614454984664917, "adv/ratio_final_to_reasoning": 1.3085280790038103, "adv/ratio_step_to_reasoning": 1.4750035412639522, "adv/std_final_conf": 0.7802751064300537, "adv/std_reasoning": 0.7014001607894897, "adv/std_step_conf": 0.8601456880569458, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.15234375, "calib/ece": 0.31559109311740885, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9068825910931174, "calib/gap": 0.1770145704467352, "calib/mean_conf": 0.9144574898785426, "calib/mu_c": 0.9839733333333334, "calib/mu_w": 0.8069587628865982, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31138056680161935, "calib/std_conf": 0.2604006668620519, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42816789297658864, "calib/step_q_c_n": 1794.0, "calib/step_q_gap": -0.026669668532432633, "calib/step_q_w": 0.45483756150902127, "calib/step_q_w_n": 1829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 743.984375, "completions/mean_terminated_length": 771.0931396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.11306666666666666, "grad_norm": 0.9969174861907959, "kl": 0.2369384765625, "learning_rate": 2.6388888888888893e-06, "loss": -0.0807, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020107656717300415, "mask/share_reasoning": 0.7740259170532227, "mask/share_step_conf": 0.17071017622947693, "num_tokens": 32750191.0, "reward": 0.5974287986755371, "reward_std": 0.22962352633476257, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6597416400909424, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.22495976090431213, "step": 106 }, { "adv/mean_abs_final_conf": 0.5846522450447083, "adv/mean_abs_reasoning": 0.41004979610443115, "adv/mean_abs_step_conf": 0.6936636567115784, "adv/ratio_final_to_reasoning": 1.4258079155240198, "adv/ratio_step_to_reasoning": 1.6916571189683427, "adv/std_final_conf": 0.7986034154891968, "adv/std_reasoning": 0.7014772295951843, "adv/std_step_conf": 0.891394317150116, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.63671875, "calib/ece": 0.26622419999999986, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.896, "calib/gap": 0.1674748723073506, "calib/mean_conf": 0.9120718000000001, "calib/mu_c": 0.9649938596491228, "calib/mu_w": 0.7975189873417722, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24714799999999987, "calib/std_conf": 0.2620013802535399, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3906449145299145, "calib/step_q_c_n": 2340.0, "calib/step_q_gap": 0.0008439166681896149, "calib/step_q_w": 0.3898009978617249, "calib/step_q_w_n": 1403.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 765.4609375, "completions/mean_terminated_length": 783.83203125, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.11413333333333334, "grad_norm": 0.9824727773666382, "kl": 0.240814208984375, "learning_rate": 2.6111111111111113e-06, "loss": -0.0456, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020465373992919922, "mask/share_reasoning": 0.7757344245910645, "mask/share_step_conf": 0.18036270141601562, "num_tokens": 33050765.0, "reward": 0.6862397193908691, "reward_std": 0.25337162613868713, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7182737588882446, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.3260806202888489, "step": 107 }, { "adv/mean_abs_final_conf": 0.4626810848712921, "adv/mean_abs_reasoning": 0.30211400985717773, "adv/mean_abs_step_conf": 0.5840635299682617, "adv/ratio_final_to_reasoning": 1.5314784146886182, "adv/ratio_step_to_reasoning": 1.9332553635773913, "adv/std_final_conf": 0.724582314491272, "adv/std_reasoning": 0.6183889508247375, "adv/std_step_conf": 0.8277202248573303, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.48046875, "calib/ece": 0.2529601593625499, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9083665338645418, "calib/gap": 0.028346978021977942, "calib/mean_conf": 0.9171832669322709, "calib/mu_c": 0.9235076923076924, "calib/mu_w": 0.8951607142857144, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19662549800796822, "calib/std_conf": 0.2634861712294742, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41951341695172933, "calib/step_q_c_n": 2631.0, "calib/step_q_gap": 0.028675312862510016, "calib/step_q_w": 0.3908381040892193, "calib/step_q_w_n": 1076.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 792.234375, "completions/mean_terminated_length": 808.0159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.1152, "grad_norm": 0.8382714986801147, "kl": 0.227691650390625, "learning_rate": 2.5833333333333337e-06, "loss": 0.0008, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020667005330324173, "mask/share_reasoning": 0.7799920439720154, "mask/share_step_conf": 0.17980965971946716, "num_tokens": 33356809.0, "reward": 0.653134286403656, "reward_std": 0.1993504762649536, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7327922582626343, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.22503873705863953, "step": 108 }, { "adv/mean_abs_final_conf": 0.4827290177345276, "adv/mean_abs_reasoning": 0.44099271297454834, "adv/mean_abs_step_conf": 0.5236935615539551, "adv/ratio_final_to_reasoning": 1.094641710695088, "adv/ratio_step_to_reasoning": 1.1875333676640134, "adv/std_final_conf": 0.7577349543571472, "adv/std_reasoning": 0.7205712795257568, "adv/std_step_conf": 0.7936491370201111, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 15.640625, "calib/ece": 0.42427350427350435, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9401709401709402, "calib/gap": 0.0813558201058201, "calib/mean_conf": 0.945982905982906, "calib/mu_c": 0.9835317460317461, "calib/mu_w": 0.902175925925926, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41589743589743594, "calib/std_conf": 0.21411639070286898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4091853107344633, "calib/step_q_c_n": 1593.0, "calib/step_q_gap": -0.013271885449692655, "calib/step_q_w": 0.42245719618415595, "calib/step_q_w_n": 2411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 723.90625, "completions/mean_terminated_length": 785.2542114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.11626666666666667, "grad_norm": 0.6603429913520813, "kl": 0.238128662109375, "learning_rate": 2.5555555555555557e-06, "loss": -0.112, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.01936577633023262, "mask/share_reasoning": 0.7345376014709473, "mask/share_step_conf": 0.1679716408252716, "num_tokens": 33646729.0, "reward": 0.4862062335014343, "reward_std": 0.20336128771305084, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5301535129547119, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.16100899875164032, "step": 109 }, { "adv/mean_abs_final_conf": 0.48091477155685425, "adv/mean_abs_reasoning": 0.40880656242370605, "adv/mean_abs_step_conf": 0.711048424243927, "adv/ratio_final_to_reasoning": 1.176387112539578, "adv/ratio_step_to_reasoning": 1.7393273239752045, "adv/std_final_conf": 0.7066504955291748, "adv/std_reasoning": 0.681603729724884, "adv/std_step_conf": 0.9066959023475647, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.2734375, "calib/ece": 0.2816929133858269, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.937007874015748, "calib/gap": 0.08334684684684668, "calib/mean_conf": 0.9355511811023624, "calib/mu_c": 0.9598333333333332, "calib/mu_w": 0.8764864864864865, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25429133858267733, "calib/std_conf": 0.2419586484050014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4007450095057034, "calib/step_q_c_n": 2104.0, "calib/step_q_gap": -0.028012697623390925, "calib/step_q_w": 0.42875770712909433, "calib/step_q_w_n": 1038.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 725.7421875, "completions/mean_terminated_length": 731.4566650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.11733333333333333, "grad_norm": 0.5370349884033203, "kl": 0.266632080078125, "learning_rate": 2.5277777777777778e-06, "loss": -0.005, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022184470668435097, "mask/share_reasoning": 0.7960182428359985, "mask/share_step_conf": 0.17398479580879211, "num_tokens": 33937439.0, "reward": 0.6677199602127075, "reward_std": 0.2457650750875473, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7123234272003174, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2840539813041687, "step": 110 }, { "adv/mean_abs_final_conf": 0.45591533184051514, "adv/mean_abs_reasoning": 0.48294445872306824, "adv/mean_abs_step_conf": 0.6763150691986084, "adv/ratio_final_to_reasoning": 0.9440326389622119, "adv/ratio_step_to_reasoning": 1.400399273628323, "adv/std_final_conf": 0.7039382457733154, "adv/std_reasoning": 0.7393079400062561, "adv/std_step_conf": 0.8756763339042664, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.30078125, "calib/ece": 0.24377551020408156, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9755102040816327, "calib/gap": 0.054459721487749024, "calib/mean_conf": 0.9743877551020408, "calib/mu_c": 0.9881693989071039, "calib/mu_w": 0.9337096774193548, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23561224489795912, "calib/std_conf": 0.15442068386445038, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3812925341296929, "calib/step_q_c_n": 2344.0, "calib/step_q_gap": 0.0034103017834514193, "calib/step_q_w": 0.37788223234624146, "calib/step_q_w_n": 1317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 751.015625, "completions/mean_terminated_length": 784.7346801757812, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 0.1184, "grad_norm": 0.4570012390613556, "kl": 0.243682861328125, "learning_rate": 2.5e-06, "loss": -0.0591, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020198732614517212, "mask/share_reasoning": 0.7622815370559692, "mask/share_step_conf": 0.17455099523067474, "num_tokens": 34237107.0, "reward": 0.6729134321212769, "reward_std": 0.25619810819625854, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7235041856765747, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2879476547241211, "step": 111 }, { "adv/mean_abs_final_conf": 0.5636082291603088, "adv/mean_abs_reasoning": 0.42544421553611755, "adv/mean_abs_step_conf": 0.5213595628738403, "adv/ratio_final_to_reasoning": 1.324752361364429, "adv/ratio_step_to_reasoning": 1.2254475295118454, "adv/std_final_conf": 0.791305661201477, "adv/std_reasoning": 0.7014791369438171, "adv/std_step_conf": 0.7762733697891235, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 15.14453125, "calib/ece": 0.34247950819672135, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9549180327868853, "calib/gap": 0.023449449973808267, "calib/mean_conf": 0.9535450819672132, "calib/mu_c": 0.9615217391304348, "calib/mu_w": 0.9380722891566265, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31809426229508203, "calib/std_conf": 0.20625821895296445, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33513364183248495, "calib/step_q_c_n": 2161.0, "calib/step_q_gap": -0.007010239286396158, "calib/step_q_w": 0.3421438811188811, "calib/step_q_w_n": 1716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 809.3046875, "completions/mean_terminated_length": 849.1065063476562, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.11946666666666667, "grad_norm": 1.2874290943145752, "kl": 0.214996337890625, "learning_rate": 2.4722222222222226e-06, "loss": -0.0949, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017914071679115295, "mask/share_reasoning": 0.7694779634475708, "mask/share_step_conf": 0.16573293507099152, "num_tokens": 34552209.0, "reward": 0.5713348388671875, "reward_std": 0.23633083701133728, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6264573335647583, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.19980597496032715, "step": 112 }, { "adv/mean_abs_final_conf": 0.5868669152259827, "adv/mean_abs_reasoning": 0.4290392994880676, "adv/mean_abs_step_conf": 0.7190387845039368, "adv/ratio_final_to_reasoning": 1.3678628412973728, "adv/ratio_step_to_reasoning": 1.675927555731837, "adv/std_final_conf": 0.7838603258132935, "adv/std_reasoning": 0.701424777507782, "adv/std_step_conf": 0.9065331816673279, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.671875, "calib/ece": 0.29174206349206355, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9246031746031746, "calib/gap": 0.19411820652173917, "calib/mean_conf": 0.9266626984126984, "calib/mu_c": 0.9975312500000001, "calib/mu_w": 0.8034130434782609, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29174206349206355, "calib/std_conf": 0.25170808811888545, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3293279332351497, "calib/step_q_c_n": 2037.0, "calib/step_q_gap": 0.021470790378006865, "calib/step_q_w": 0.30785714285714283, "calib/step_q_w_n": 1463.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 742.3125, "completions/mean_terminated_length": 754.0952758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.12053333333333334, "grad_norm": 0.7477316856384277, "kl": 0.2469482421875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0327, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02127990871667862, "mask/share_reasoning": 0.7831719517707825, "mask/share_step_conf": 0.17992308735847473, "num_tokens": 34847441.0, "reward": 0.6346909999847412, "reward_std": 0.2597951292991638, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6986355781555176, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.24887137115001678, "step": 113 }, { "adv/mean_abs_final_conf": 0.3718037009239197, "adv/mean_abs_reasoning": 0.30481839179992676, "adv/mean_abs_step_conf": 0.6702938675880432, "adv/ratio_final_to_reasoning": 1.2197548144272081, "adv/ratio_step_to_reasoning": 2.1989941736455427, "adv/std_final_conf": 0.6294357776641846, "adv/std_reasoning": 0.6184821128845215, "adv/std_step_conf": 0.8755671977996826, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.2109375, "calib/ece": 0.17660887096774197, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9395161290322581, "calib/gap": 0.19603357522980136, "calib/mean_conf": 0.9428185483870969, "calib/mu_c": 0.9847128205128205, "calib/mu_w": 0.7886792452830191, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1665685483870968, "calib/std_conf": 0.22223807376359006, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3469958506224066, "calib/step_q_c_n": 2410.0, "calib/step_q_gap": 0.03685334247908417, "calib/step_q_w": 0.31014250814332245, "calib/step_q_w_n": 1228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 763.73046875, "completions/mean_terminated_length": 788.3668823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.1216, "grad_norm": 0.5136744976043701, "kl": 0.23712158203125, "learning_rate": 2.4166666666666667e-06, "loss": -0.0732, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.021277498453855515, "mask/share_reasoning": 0.7698897123336792, "mask/share_step_conf": 0.17758281528949738, "num_tokens": 35147980.0, "reward": 0.6923535466194153, "reward_std": 0.19236746430397034, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7982048988342285, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.24040833115577698, "step": 114 }, { "adv/mean_abs_final_conf": 0.5842900276184082, "adv/mean_abs_reasoning": 0.4125009775161743, "adv/mean_abs_step_conf": 0.6591264009475708, "adv/ratio_final_to_reasoning": 1.4164573163841727, "adv/ratio_step_to_reasoning": 1.597878397565073, "adv/std_final_conf": 0.7837476134300232, "adv/std_reasoning": 0.6817185282707214, "adv/std_step_conf": 0.8758804202079773, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 13.6640625, "calib/ece": 0.3283157894736841, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8623481781376519, "calib/gap": 0.15629926296759833, "calib/mean_conf": 0.867668016194332, "calib/mu_c": 0.9271503267973855, "calib/mu_w": 0.7708510638297872, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2882753036437246, "calib/std_conf": 0.3242157949415586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33306884057971015, "calib/step_q_c_n": 1932.0, "calib/step_q_gap": 0.031195915930923468, "calib/step_q_w": 0.3018729246487867, "calib/step_q_w_n": 1566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 722.2109375, "completions/mean_terminated_length": 748.5263061523438, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.12266666666666666, "grad_norm": 0.931440532207489, "kl": 0.24176025390625, "learning_rate": 2.388888888888889e-06, "loss": -0.1057, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020129762589931488, "mask/share_reasoning": 0.7740639448165894, "mask/share_step_conf": 0.17065003514289856, "num_tokens": 35438130.0, "reward": 0.564358115196228, "reward_std": 0.24982187151908875, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6476209759712219, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.16859526932239532, "step": 115 }, { "adv/mean_abs_final_conf": 0.629609227180481, "adv/mean_abs_reasoning": 0.4123613238334656, "adv/mean_abs_step_conf": 0.6170235276222229, "adv/ratio_final_to_reasoning": 1.526838698953135, "adv/ratio_step_to_reasoning": 1.4963176514376777, "adv/std_final_conf": 0.8164438009262085, "adv/std_reasoning": 0.6817293167114258, "adv/std_step_conf": 0.8592019081115723, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 16.59765625, "calib/ece": 0.2614959016393441, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8032786885245902, "calib/gap": 0.25743358047574905, "calib/mean_conf": 0.8067418032786884, "calib/mu_c": 0.8890361445783131, "calib/mu_w": 0.6316025641025641, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19395491803278672, "calib/std_conf": 0.37868427706350427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3237091684434968, "calib/step_q_c_n": 2345.0, "calib/step_q_gap": 0.02345181550232034, "calib/step_q_w": 0.30025735294117645, "calib/step_q_w_n": 1904.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 832.04296875, "completions/mean_terminated_length": 872.9630737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.12373333333333333, "grad_norm": 1.2763618230819702, "kl": 0.215667724609375, "learning_rate": 2.361111111111111e-06, "loss": -0.0845, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018999746069312096, "mask/share_reasoning": 0.766923725605011, "mask/share_step_conf": 0.16720154881477356, "num_tokens": 35755653.0, "reward": 0.6194130778312683, "reward_std": 0.2369731217622757, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7006518244743347, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.21786174178123474, "step": 116 }, { "adv/mean_abs_final_conf": 0.7082833051681519, "adv/mean_abs_reasoning": 0.48807835578918457, "adv/mean_abs_step_conf": 0.6436492204666138, "adv/ratio_final_to_reasoning": 1.4511672086399183, "adv/ratio_step_to_reasoning": 1.3187415767000839, "adv/std_final_conf": 0.8633624911308289, "adv/std_reasoning": 0.7394745349884033, "adv/std_step_conf": 0.8554308414459229, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 16.33203125, "calib/ece": 0.3534663865546218, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.21689822755860483, "calib/mean_conf": 0.6089285714285715, "calib/mu_c": 0.705530303030303, "calib/mu_w": 0.4886320754716981, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20388655462184874, "calib/std_conf": 0.46709724837101574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3297563805104408, "calib/step_q_c_n": 1724.0, "calib/step_q_gap": 0.05719838295244328, "calib/step_q_w": 0.27255799755799753, "calib/step_q_w_n": 2457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 753.875, "completions/mean_terminated_length": 810.8908081054688, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.1248, "grad_norm": 2.708895683288574, "kl": 0.22900390625, "learning_rate": 2.3333333333333336e-06, "loss": -0.2058, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.018687278032302856, "mask/share_reasoning": 0.7465159296989441, "mask/share_step_conf": 0.16448429226875305, "num_tokens": 36055245.0, "reward": 0.5642485618591309, "reward_std": 0.2435990869998932, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5940789580345154, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.24535568058490753, "step": 117 }, { "adv/mean_abs_final_conf": 0.7272151708602905, "adv/mean_abs_reasoning": 0.31207209825515747, "adv/mean_abs_step_conf": 0.6416611075401306, "adv/ratio_final_to_reasoning": 2.3302793646925215, "adv/ratio_step_to_reasoning": 2.0561309746297582, "adv/std_final_conf": 0.8775283694267273, "adv/std_reasoning": 0.5962818264961243, "adv/std_step_conf": 0.8595139384269714, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 16.84375, "calib/ece": 0.27910041841004174, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5774058577405857, "calib/gap": 0.2964238190286095, "calib/mean_conf": 0.6315690376569039, "calib/mu_c": 0.7208682634730539, "calib/mu_w": 0.4244444444444444, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10596234309623422, "calib/std_conf": 0.4492533862604019, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3522909090909091, "calib/step_q_c_n": 2200.0, "calib/step_q_gap": -0.018520643939393955, "calib/step_q_w": 0.37081155303030305, "calib/step_q_w_n": 2112.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 812.98046875, "completions/mean_terminated_length": 867.17919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.12586666666666665, "grad_norm": 4.6349310874938965, "kl": 0.223388671875, "learning_rate": 2.305555555555556e-06, "loss": -0.197, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01810162514448166, "mask/share_reasoning": 0.7572929263114929, "mask/share_step_conf": 0.16210542619228363, "num_tokens": 36367376.0, "reward": 0.5802870988845825, "reward_std": 0.21968314051628113, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6609405279159546, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.18244624137878418, "step": 118 }, { "adv/mean_abs_final_conf": 0.6874723434448242, "adv/mean_abs_reasoning": 0.44178149104118347, "adv/mean_abs_step_conf": 0.6230811476707458, "adv/ratio_final_to_reasoning": 1.5561365910205982, "adv/ratio_step_to_reasoning": 1.4103830973141909, "adv/std_final_conf": 0.8441257476806641, "adv/std_reasoning": 0.7208822965621948, "adv/std_step_conf": 0.8435025811195374, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 18.06640625, "calib/ece": 0.32571120689655175, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.6681034482758621, "calib/gap": 0.13268315018315013, "calib/mean_conf": 0.7221767241379311, "calib/mu_c": 0.7667857142857143, "calib/mu_w": 0.6341025641025642, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1920474137931035, "calib/std_conf": 0.41018065697298056, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32542656325739217, "calib/step_q_c_n": 2063.0, "calib/step_q_gap": -0.023979369607557033, "calib/step_q_w": 0.3494059328649492, "calib/step_q_w_n": 2562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 787.16015625, "completions/mean_terminated_length": 864.8626708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.12693333333333334, "grad_norm": 1.767160177230835, "kl": 0.221343994140625, "learning_rate": 2.277777777777778e-06, "loss": -0.2612, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.018292531371116638, "mask/share_reasoning": 0.7322384715080261, "mask/share_step_conf": 0.15962526202201843, "num_tokens": 36673953.0, "reward": 0.5670791864395142, "reward_std": 0.2390528917312622, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6021067500114441, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.2304891049861908, "step": 119 }, { "adv/mean_abs_final_conf": 0.546036958694458, "adv/mean_abs_reasoning": 0.3437703847885132, "adv/mean_abs_step_conf": 0.704839289188385, "adv/ratio_final_to_reasoning": 1.5883769599012398, "adv/ratio_step_to_reasoning": 2.050319982106663, "adv/std_final_conf": 0.753393292427063, "adv/std_reasoning": 0.6403025984764099, "adv/std_step_conf": 0.9208722710609436, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 13.94140625, "calib/ece": 0.18858775510204093, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7346938775510204, "calib/gap": 0.4150723881776512, "calib/mean_conf": 0.7817306122448979, "calib/mu_c": 0.9070994152046782, "calib/mu_w": 0.492027027027027, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1361795918367348, "calib/std_conf": 0.38059433768675144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34108988764044945, "calib/step_q_c_n": 1958.0, "calib/step_q_gap": -0.037941769715230333, "calib/step_q_w": 0.3790316573556798, "calib/step_q_w_n": 1611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 725.33203125, "completions/mean_terminated_length": 757.89794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.128, "grad_norm": 1.3622678518295288, "kl": 0.246917724609375, "learning_rate": 2.25e-06, "loss": -0.0768, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020279962569475174, "mask/share_reasoning": 0.7692380547523499, "mask/share_step_conf": 0.16751320660114288, "num_tokens": 36966326.0, "reward": 0.6724545955657959, "reward_std": 0.179477721452713, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7735120058059692, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.2471783459186554, "step": 120 }, { "adv/mean_abs_final_conf": 0.6599460244178772, "adv/mean_abs_reasoning": 0.5737216472625732, "adv/mean_abs_step_conf": 0.6109805107116699, "adv/ratio_final_to_reasoning": 1.1502895656224768, "adv/ratio_step_to_reasoning": 1.0649424047826532, "adv/std_final_conf": 0.8613766431808472, "adv/std_reasoning": 0.8099164962768555, "adv/std_step_conf": 0.8437104821205139, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 15.734375, "calib/ece": 0.2038589211618257, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7510373443983402, "calib/gap": 0.32272148257725175, "calib/mean_conf": 0.7997095435684648, "calib/mu_c": 0.8961242603550296, "calib/mu_w": 0.5734027777777778, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15116182572614104, "calib/std_conf": 0.3613657224261938, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3475085638998683, "calib/step_q_c_n": 2277.0, "calib/step_q_gap": 0.0032252971951282072, "calib/step_q_w": 0.3442832667047401, "calib/step_q_w_n": 1751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 834.1171875, "completions/mean_terminated_length": 886.0332641601562, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.12906666666666666, "grad_norm": 1.4408384561538696, "kl": 0.223052978515625, "learning_rate": 2.222222222222222e-06, "loss": -0.1841, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017796337604522705, "mask/share_reasoning": 0.7621406316757202, "mask/share_step_conf": 0.16146929562091827, "num_tokens": 37284916.0, "reward": 0.6500805616378784, "reward_std": 0.2570773661136627, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7394177913665771, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.2404308319091797, "step": 121 }, { "adv/mean_abs_final_conf": 0.4568904638290405, "adv/mean_abs_reasoning": 0.31823331117630005, "adv/mean_abs_step_conf": 0.6336137652397156, "adv/ratio_final_to_reasoning": 1.4357091095844612, "adv/ratio_step_to_reasoning": 1.9910353284439666, "adv/std_final_conf": 0.7229413986206055, "adv/std_reasoning": 0.5960581302642822, "adv/std_step_conf": 0.875344455242157, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 15.18359375, "calib/ece": 0.18924686192468615, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8451882845188284, "calib/gap": 0.35369532926225444, "calib/mean_conf": 0.8679916317991632, "calib/mu_c": 0.9760240963855422, "calib/mu_w": 0.6223287671232878, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18133891213389117, "calib/std_conf": 0.3185097752305585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38515778582514226, "calib/step_q_c_n": 1933.0, "calib/step_q_gap": -0.04652082215848108, "calib/step_q_w": 0.43167860798362334, "calib/step_q_w_n": 1954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 786.87109375, "completions/mean_terminated_length": 839.3292236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.13013333333333332, "grad_norm": 1.0047415494918823, "kl": 0.219940185546875, "learning_rate": 2.1944444444444445e-06, "loss": -0.1253, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01762072555720806, "mask/share_reasoning": 0.7645582556724548, "mask/share_step_conf": 0.15532098710536957, "num_tokens": 37593699.0, "reward": 0.6291664838790894, "reward_std": 0.19675558805465698, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.75284743309021, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.18907909095287323, "step": 122 }, { "adv/mean_abs_final_conf": 0.5483672618865967, "adv/mean_abs_reasoning": 0.5403405427932739, "adv/mean_abs_step_conf": 0.5864270925521851, "adv/ratio_final_to_reasoning": 1.0148549265835742, "adv/ratio_step_to_reasoning": 1.0852916746181365, "adv/std_final_conf": 0.7761465311050415, "adv/std_reasoning": 0.7929040193557739, "adv/std_step_conf": 0.8109169602394104, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 15.0859375, "calib/ece": 0.2846694214876034, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9049586776859504, "calib/gap": 0.17493440667859284, "calib/mean_conf": 0.9196280991735538, "calib/mu_c": 0.9817948717948719, "calib/mu_w": 0.8068604651162791, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27983471074380173, "calib/std_conf": 0.2568951335149414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3788781163434903, "calib/step_q_c_n": 2166.0, "calib/step_q_gap": -0.016869525165943755, "calib/step_q_w": 0.39574764150943403, "calib/step_q_w_n": 1696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 852.58984375, "completions/mean_terminated_length": 898.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.1312, "grad_norm": 1.0780717134475708, "kl": 0.21270751953125, "learning_rate": 2.166666666666667e-06, "loss": -0.2034, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017740830779075623, "mask/share_reasoning": 0.7722517251968384, "mask/share_step_conf": 0.1592261791229248, "num_tokens": 37917250.0, "reward": 0.5830578207969666, "reward_std": 0.24974925816059113, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6706482172012329, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.1845298558473587, "step": 123 }, { "adv/mean_abs_final_conf": 0.4309426546096802, "adv/mean_abs_reasoning": 0.44589105248451233, "adv/mean_abs_step_conf": 0.6884101033210754, "adv/ratio_final_to_reasoning": 0.9664752235068647, "adv/ratio_step_to_reasoning": 1.5438975496037495, "adv/std_final_conf": 0.703031063079834, "adv/std_reasoning": 0.7206540703773499, "adv/std_step_conf": 0.8913577198982239, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 14.015625, "calib/ece": 0.20929752066115706, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8966942148760331, "calib/gap": 0.15107758620689649, "calib/mean_conf": 0.9200413223140496, "calib/mu_c": 0.9562499999999999, "calib/mu_w": 0.8051724137931034, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.184504132231405, "calib/std_conf": 0.24728525692500625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4224380762411347, "calib/step_q_c_n": 2256.0, "calib/step_q_gap": -0.04011072255766407, "calib/step_q_w": 0.4625487987987988, "calib/step_q_w_n": 1332.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 774.3203125, "completions/mean_terminated_length": 819.1156616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.13226666666666667, "grad_norm": 0.6981167793273926, "kl": 0.224945068359375, "learning_rate": 2.138888888888889e-06, "loss": -0.1673, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018614403903484344, "mask/share_reasoning": 0.7690895199775696, "mask/share_step_conf": 0.1576085388660431, "num_tokens": 38222292.0, "reward": 0.669231116771698, "reward_std": 0.23245593905448914, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7431816458702087, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.26324930787086487, "step": 124 }, { "adv/mean_abs_final_conf": 0.5283767580986023, "adv/mean_abs_reasoning": 0.4329782724380493, "adv/mean_abs_step_conf": 0.5721372365951538, "adv/ratio_final_to_reasoning": 1.220330884326771, "adv/ratio_step_to_reasoning": 1.3213994165885434, "adv/std_final_conf": 0.776694655418396, "adv/std_reasoning": 0.720791220664978, "adv/std_step_conf": 0.8109546899795532, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 13.53125, "calib/ece": 0.28979591836734697, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8367346938775511, "calib/gap": 0.18335968081903042, "calib/mean_conf": 0.8522448979591837, "calib/mu_c": 0.9128658536585367, "calib/mu_w": 0.7295061728395063, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23632653061224493, "calib/std_conf": 0.3435865176735409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41351314804917155, "calib/step_q_c_n": 1871.0, "calib/step_q_gap": -0.040548371097093305, "calib/step_q_w": 0.45406151914626486, "calib/step_q_w_n": 1593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 767.30859375, "completions/mean_terminated_length": 798.5, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.13333333333333333, "grad_norm": 0.6414250731468201, "kl": 0.235382080078125, "learning_rate": 2.1111111111111114e-06, "loss": -0.1151, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020531386137008667, "mask/share_reasoning": 0.7824704647064209, "mask/share_step_conf": 0.15793566405773163, "num_tokens": 38523531.0, "reward": 0.6279151439666748, "reward_std": 0.24993808567523956, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6779242157936096, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2583746910095215, "step": 125 }, { "adv/mean_abs_final_conf": 0.5080631971359253, "adv/mean_abs_reasoning": 0.4330710768699646, "adv/mean_abs_step_conf": 0.5981600880622864, "adv/ratio_final_to_reasoning": 1.1731635388998238, "adv/ratio_step_to_reasoning": 1.3812053494440404, "adv/std_final_conf": 0.7579245567321777, "adv/std_reasoning": 0.7016152739524841, "adv/std_step_conf": 0.8276919722557068, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 14.2109375, "calib/ece": 0.292983193277311, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.865546218487395, "calib/gap": 0.1315063291139239, "calib/mean_conf": 0.9013025210084034, "calib/mu_c": 0.945506329113924, "calib/mu_w": 0.8140000000000001, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2652100840336135, "calib/std_conf": 0.26951946203161764, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40487768969422416, "calib/step_q_c_n": 1766.0, "calib/step_q_gap": 0.024452476019010516, "calib/step_q_w": 0.38042521367521365, "calib/step_q_w_n": 1872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 739.34375, "completions/mean_terminated_length": 795.2605590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.1344, "grad_norm": 0.9234392642974854, "kl": 0.234893798828125, "learning_rate": 2.0833333333333334e-06, "loss": -0.2053, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.020162848755717278, "mask/share_reasoning": 0.7513543367385864, "mask/share_step_conf": 0.15817034244537354, "num_tokens": 38818267.0, "reward": 0.5585942268371582, "reward_std": 0.23993246257305145, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6568480730056763, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.1509653478860855, "step": 126 }, { "adv/mean_abs_final_conf": 0.45118528604507446, "adv/mean_abs_reasoning": 0.4313002824783325, "adv/mean_abs_step_conf": 0.6212649345397949, "adv/ratio_final_to_reasoning": 1.0461047775171373, "adv/ratio_step_to_reasoning": 1.440446389160447, "adv/std_final_conf": 0.7026150822639465, "adv/std_reasoning": 0.7016158699989319, "adv/std_step_conf": 0.8277674317359924, "calib/answer_extract_rate": 0.92578125, "calib/avg_num_step_conf": 13.5390625, "calib/ece": 0.31177215189873425, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.8860759493670886, "calib/gap": 0.12139142407553105, "calib/mean_conf": 0.8983544303797468, "calib/mu_c": 0.9403548387096774, "calib/mu_w": 0.8189634146341463, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27805907172995786, "calib/std_conf": 0.290054616040288, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.446405503057254, "calib/step_q_c_n": 1799.0, "calib/step_q_gap": 0.031089966164632554, "calib/step_q_w": 0.41531553689262146, "calib/step_q_w_n": 1667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 690.59765625, "completions/mean_terminated_length": 742.8277587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.13546666666666668, "grad_norm": 0.7550332546234131, "kl": 0.249603271484375, "learning_rate": 2.0555555555555555e-06, "loss": -0.1629, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.02026120387017727, "mask/share_reasoning": 0.7483981847763062, "mask/share_step_conf": 0.16102807223796844, "num_tokens": 39098732.0, "reward": 0.5520787239074707, "reward_std": 0.24869316816329956, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6339927911758423, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.16391471028327942, "step": 127 }, { "adv/mean_abs_final_conf": 0.5826007127761841, "adv/mean_abs_reasoning": 0.4872943162918091, "adv/mean_abs_step_conf": 0.5902491807937622, "adv/ratio_final_to_reasoning": 1.195582819864663, "adv/ratio_step_to_reasoning": 1.2112786073217816, "adv/std_final_conf": 0.8118354678153992, "adv/std_reasoning": 0.7394742369651794, "adv/std_step_conf": 0.827497124671936, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 14.203125, "calib/ece": 0.29087866108786625, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8702928870292888, "calib/gap": 0.2160395010395012, "calib/mean_conf": 0.9030125523012552, "calib/mu_c": 0.9852702702702703, "calib/mu_w": 0.769230769230769, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28732217573221774, "calib/std_conf": 0.2635021691521325, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42852554961378486, "calib/step_q_c_n": 1683.0, "calib/step_q_gap": 0.05191356804696462, "calib/step_q_w": 0.37661198156682024, "calib/step_q_w_n": 1953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 730.25390625, "completions/mean_terminated_length": 782.1966552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.13653333333333334, "grad_norm": 0.9753623604774475, "kl": 0.23822021484375, "learning_rate": 2.027777777777778e-06, "loss": -0.2389, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.020317887887358665, "mask/share_reasoning": 0.7514255046844482, "mask/share_step_conf": 0.16185034811496735, "num_tokens": 39392341.0, "reward": 0.5939993262290955, "reward_std": 0.26951467990875244, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6685829758644104, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.2170717716217041, "step": 128 }, { "adv/mean_abs_final_conf": 0.4442763328552246, "adv/mean_abs_reasoning": 0.45170193910598755, "adv/mean_abs_step_conf": 0.6424069404602051, "adv/ratio_final_to_reasoning": 0.9835608271563772, "adv/ratio_step_to_reasoning": 1.422192124593626, "adv/std_final_conf": 0.7024354934692383, "adv/std_reasoning": 0.7207036018371582, "adv/std_step_conf": 0.8758267164230347, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 13.2734375, "calib/ece": 0.24002857142857154, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7918367346938775, "calib/gap": 0.2521655696943106, "calib/mean_conf": 0.8419061224489796, "calib/mu_c": 0.9304213836477988, "calib/mu_w": 0.6782558139534882, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21647755102040828, "calib/std_conf": 0.32542536584910803, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4621792763157895, "calib/step_q_c_n": 1824.0, "calib/step_q_gap": 0.05840926360930926, "calib/step_q_w": 0.4037700127064802, "calib/step_q_w_n": 1574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 692.5078125, "completions/mean_terminated_length": 723.5999755859375, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.1376, "grad_norm": 0.7908511757850647, "kl": 0.26397705078125, "learning_rate": 2.0000000000000003e-06, "loss": -0.1356, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.022384271025657654, "mask/share_reasoning": 0.7689116597175598, "mask/share_step_conf": 0.16573531925678253, "num_tokens": 39672007.0, "reward": 0.6568441390991211, "reward_std": 0.1993679404258728, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7119947075843811, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2860685884952545, "step": 129 }, { "adv/mean_abs_final_conf": 0.3676624596118927, "adv/mean_abs_reasoning": 0.19307512044906616, "adv/mean_abs_step_conf": 0.5610698461532593, "adv/ratio_final_to_reasoning": 1.9042456571139794, "adv/ratio_step_to_reasoning": 2.905966573260646, "adv/std_final_conf": 0.6428574323654175, "adv/std_reasoning": 0.495947003364563, "adv/std_step_conf": 0.8103545308113098, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 12.69921875, "calib/ece": 0.14574596774193543, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8064516129032258, "calib/gap": 0.38447490347490376, "calib/mean_conf": 0.8525201612903226, "calib/mu_c": 0.9501891891891894, "calib/mu_w": 0.5657142857142856, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12614919354838705, "calib/std_conf": 0.31570038559445907, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4415043301069791, "calib/step_q_c_n": 1963.0, "calib/step_q_gap": 0.04577063445480517, "calib/step_q_w": 0.3957336956521739, "calib/step_q_w_n": 1288.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 663.4609375, "completions/mean_terminated_length": 684.8628540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 316.0, "epoch": 0.13866666666666666, "grad_norm": 1.1481865644454956, "kl": 0.264801025390625, "learning_rate": 1.9722222222222224e-06, "loss": -0.1471, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02265078015625477, "mask/share_reasoning": 0.7670093774795532, "mask/share_step_conf": 0.17908982932567596, "num_tokens": 39947141.0, "reward": 0.7057443857192993, "reward_std": 0.14136728644371033, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8187835216522217, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.25442397594451904, "step": 130 }, { "adv/mean_abs_final_conf": 0.6045816540718079, "adv/mean_abs_reasoning": 0.42527127265930176, "adv/mean_abs_step_conf": 0.5815385580062866, "adv/ratio_final_to_reasoning": 1.4216376532824433, "adv/ratio_step_to_reasoning": 1.3674531890428807, "adv/std_final_conf": 0.7785736918449402, "adv/std_reasoning": 0.7207863926887512, "adv/std_step_conf": 0.8273931741714478, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 15.6015625, "calib/ece": 0.15808189655172422, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.4827586206896552, "calib/gap": 0.5209051724137934, "calib/mean_conf": 0.5738577586206897, "calib/mu_c": 0.8343103448275864, "calib/mu_w": 0.313405172413793, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11596982758620697, "calib/std_conf": 0.4387053460233405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38929024583663757, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.10923810533169798, "calib/step_q_w": 0.2800521405049396, "calib/step_q_w_n": 2733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 698.453125, "completions/mean_terminated_length": 764.1196899414062, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.13973333333333332, "grad_norm": 1.5745195150375366, "kl": 0.25299072265625, "learning_rate": 1.944444444444445e-06, "loss": -0.2455, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.0197146013379097, "mask/share_reasoning": 0.7379894256591797, "mask/share_step_conf": 0.15635845065116882, "num_tokens": 40232153.0, "reward": 0.6144878268241882, "reward_std": 0.19122637808322906, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7363600730895996, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.22074052691459656, "step": 131 }, { "adv/mean_abs_final_conf": 0.580619215965271, "adv/mean_abs_reasoning": 0.43576955795288086, "adv/mean_abs_step_conf": 0.606571614742279, "adv/ratio_final_to_reasoning": 1.332399671727534, "adv/ratio_step_to_reasoning": 1.3919549993160991, "adv/std_final_conf": 0.8146044015884399, "adv/std_reasoning": 0.6819318532943726, "adv/std_step_conf": 0.8462624549865723, "calib/answer_extract_rate": 0.8984375, "calib/avg_num_step_conf": 15.38671875, "calib/ece": 0.2710217391304348, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.44782608695652176, "calib/gap": 0.573583509513742, "calib/mean_conf": 0.5486739130434782, "calib/mu_c": 0.6559090909090909, "calib/mu_w": 0.08232558139534883, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003326086956521741, "calib/std_conf": 0.4409134320213686, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35733181818181814, "calib/step_q_c_n": 2200.0, "calib/step_q_gap": 0.10112014480631498, "calib/step_q_w": 0.25621167337550316, "calib/step_q_w_n": 1739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 664.88671875, "completions/mean_terminated_length": 740.0477905273438, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.1408, "grad_norm": 2.7517290115356445, "kl": 0.253387451171875, "learning_rate": 1.916666666666667e-06, "loss": -0.2744, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.020201554521918297, "mask/share_reasoning": 0.7178224325180054, "mask/share_step_conf": 0.16041353344917297, "num_tokens": 40507956.0, "reward": 0.6151009798049927, "reward_std": 0.19379822909832, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.6810821294784546, "rewards/format_reward_step": 0.8984375, "rewards/step_margin_reward": 0.22333842515945435, "step": 132 }, { "adv/mean_abs_final_conf": 0.8137380480766296, "adv/mean_abs_reasoning": 0.7138146162033081, "adv/mean_abs_step_conf": 0.6698095202445984, "adv/ratio_final_to_reasoning": 1.1399851300395079, "adv/ratio_step_to_reasoning": 0.9383522066376738, "adv/std_final_conf": 0.9363965392112732, "adv/std_reasoning": 0.8907052278518677, "adv/std_step_conf": 0.8721390962600708, "calib/answer_extract_rate": 0.8125, "calib/avg_num_step_conf": 20.48046875, "calib/ece": 0.33415865384615384, "calib/final_conf_rate": 0.8125, "calib/format_rate": 0.8125, "calib/frac_conf_gt_0.9": 0.17307692307692307, "calib/gap": 0.26145075757575753, "calib/mean_conf": 0.2786778846153846, "calib/mu_c": 0.38929166666666665, "calib/mu_w": 0.1278409090909091, "calib/nonempty_final_conf_rate": 0.8125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01795673076923077, "calib/std_conf": 0.3670671274431576, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3245042345276873, "calib/step_q_c_n": 1535.0, "calib/step_q_gap": 0.13698603064419215, "calib/step_q_w": 0.18751820388349513, "calib/step_q_w_n": 3708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 703.8203125, "completions/mean_terminated_length": 866.2404174804688, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.14186666666666667, "grad_norm": 7.1175689697265625, "kl": 0.2830810546875, "learning_rate": 1.888888888888889e-06, "loss": -0.5345, "mask/has_final_conf_rate": 0.8125, "mask/share_final_conf": 0.015356351621448994, "mask/share_reasoning": 0.6622164249420166, "mask/share_step_conf": 0.13492724299430847, "num_tokens": 40794478.0, "reward": 0.516724705696106, "reward_std": 0.2438364326953888, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.536136269569397, "rewards/format_reward_step": 0.8125, "rewards/step_margin_reward": 0.2410632073879242, "step": 133 }, { "adv/mean_abs_final_conf": 0.7017267942428589, "adv/mean_abs_reasoning": 0.5476149320602417, "adv/mean_abs_step_conf": 0.6388225555419922, "adv/ratio_final_to_reasoning": 1.281423776380268, "adv/ratio_step_to_reasoning": 1.166554303292294, "adv/std_final_conf": 0.8980323672294617, "adv/std_reasoning": 0.8100696802139282, "adv/std_step_conf": 0.8600804209709167, "calib/answer_extract_rate": 0.88671875, "calib/avg_num_step_conf": 17.6953125, "calib/ece": 0.36605726872246686, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.2026431718061674, "calib/gap": 0.3028257463301996, "calib/mean_conf": 0.3038546255506608, "calib/mu_c": 0.418581560283688, "calib/mu_w": 0.11575581395348837, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024383259911894245, "calib/std_conf": 0.3899412708669093, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2956536694677871, "calib/step_q_c_n": 1785.0, "calib/step_q_gap": 0.08443665671733172, "calib/step_q_w": 0.21121701275045537, "calib/step_q_w_n": 2745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 770.0234375, "completions/mean_terminated_length": 860.812255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.14293333333333333, "grad_norm": 3.8303635120391846, "kl": 0.224609375, "learning_rate": 1.8611111111111113e-06, "loss": -0.2861, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.017552226781845093, "mask/share_reasoning": 0.7260377407073975, "mask/share_step_conf": 0.15094125270843506, "num_tokens": 41100556.0, "reward": 0.51554274559021, "reward_std": 0.19827260076999664, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5803333520889282, "rewards/format_reward_step": 0.88671875, "rewards/step_margin_reward": 0.16325227916240692, "step": 134 }, { "adv/mean_abs_final_conf": 0.7464656829833984, "adv/mean_abs_reasoning": 0.5359787940979004, "adv/mean_abs_step_conf": 0.6636673808097839, "adv/ratio_final_to_reasoning": 1.3927149566425778, "adv/ratio_step_to_reasoning": 1.2382344005359291, "adv/std_final_conf": 0.9189602136611938, "adv/std_reasoning": 0.7931401133537292, "adv/std_step_conf": 0.8599939346313477, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 16.0703125, "calib/ece": 0.36658119658119653, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.2264957264957265, "calib/gap": 0.21275192554557118, "calib/mean_conf": 0.38722222222222225, "calib/mu_c": 0.46177631578947365, "calib/mu_w": 0.24902439024390247, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0521153846153846, "calib/std_conf": 0.38707507412703074, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2987839958158996, "calib/step_q_c_n": 1912.0, "calib/step_q_gap": 0.08667818291853355, "calib/step_q_w": 0.21210581289736605, "calib/step_q_w_n": 2202.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 716.93359375, "completions/mean_terminated_length": 784.337646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.144, "grad_norm": 4.714707851409912, "kl": 0.2374267578125, "learning_rate": 1.8333333333333333e-06, "loss": -0.3072, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.01891816221177578, "mask/share_reasoning": 0.7368408441543579, "mask/share_step_conf": 0.15830349922180176, "num_tokens": 41389971.0, "reward": 0.6003228425979614, "reward_std": 0.2147035449743271, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.5946650505065918, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.3044181168079376, "step": 135 }, { "adv/mean_abs_final_conf": 0.6705352067947388, "adv/mean_abs_reasoning": 0.5252009034156799, "adv/mean_abs_step_conf": 0.5619533658027649, "adv/ratio_final_to_reasoning": 1.27672135069431, "adv/ratio_step_to_reasoning": 1.0699779115916648, "adv/std_final_conf": 0.8491891026496887, "adv/std_reasoning": 0.7930968999862671, "adv/std_step_conf": 0.8269990086555481, "calib/answer_extract_rate": 0.8515625, "calib/avg_num_step_conf": 17.96484375, "calib/ece": 0.14999082568807334, "calib/final_conf_rate": 0.8515625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.42201834862385323, "calib/gap": 0.5577160548429898, "calib/mean_conf": 0.5046697247706422, "calib/mu_c": 0.7221278195488722, "calib/mu_w": 0.1644117647058824, "calib/nonempty_final_conf_rate": 0.8515625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.022284403669724735, "calib/std_conf": 0.4448066800997846, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33198000000000005, "calib/step_q_c_n": 1485.0, "calib/step_q_gap": 0.1005222286448299, "calib/step_q_w": 0.23145777135517015, "calib/step_q_w_n": 3114.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 652.3203125, "completions/mean_terminated_length": 766.0274658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.14506666666666668, "grad_norm": 1.8986977338790894, "kl": 0.254119873046875, "learning_rate": 1.8055555555555557e-06, "loss": -0.4585, "mask/has_final_conf_rate": 0.8515625, "mask/share_final_conf": 0.018733292818069458, "mask/share_reasoning": 0.6779872179031372, "mask/share_step_conf": 0.15484192967414856, "num_tokens": 41665453.0, "reward": 0.583008885383606, "reward_std": 0.22099415957927704, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6969972848892212, "rewards/format_reward_step": 0.8515625, "rewards/step_margin_reward": 0.19480186700820923, "step": 136 }, { "adv/mean_abs_final_conf": 0.6077395677566528, "adv/mean_abs_reasoning": 0.3589785397052765, "adv/mean_abs_step_conf": 0.52858567237854, "adv/ratio_final_to_reasoning": 1.6929690790307705, "adv/ratio_step_to_reasoning": 1.4724715098916832, "adv/std_final_conf": 0.8171355724334717, "adv/std_reasoning": 0.6407114863395691, "adv/std_step_conf": 0.776058554649353, "calib/answer_extract_rate": 0.87890625, "calib/avg_num_step_conf": 17.68359375, "calib/ece": 0.20969333333333343, "calib/final_conf_rate": 0.87890625, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.3545913728297836, "calib/mean_conf": 0.7405377777777777, "calib/mu_c": 0.8571589403973511, "calib/mu_w": 0.5025675675675675, "calib/nonempty_final_conf_rate": 0.87890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1395600000000001, "calib/std_conf": 0.384513475729114, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3302354227405248, "calib/step_q_c_n": 2058.0, "calib/step_q_gap": 0.061405127074263155, "calib/step_q_w": 0.26883029566626165, "calib/step_q_w_n": 2469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 720.8125, "completions/mean_terminated_length": 820.1244506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.14613333333333334, "grad_norm": 2.3515427112579346, "kl": 0.222381591796875, "learning_rate": 1.777777777777778e-06, "loss": -0.3275, "mask/has_final_conf_rate": 0.87890625, "mask/share_final_conf": 0.01775544509291649, "mask/share_reasoning": 0.7089648246765137, "mask/share_step_conf": 0.15218594670295715, "num_tokens": 41956965.0, "reward": 0.6170933246612549, "reward_std": 0.19505244493484497, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6883065700531006, "rewards/format_reward_step": 0.87890625, "rewards/step_margin_reward": 0.25213009119033813, "step": 137 }, { "adv/mean_abs_final_conf": 0.5493025779724121, "adv/mean_abs_reasoning": 0.35511231422424316, "adv/mean_abs_step_conf": 0.632819652557373, "adv/ratio_final_to_reasoning": 1.5468418186859705, "adv/ratio_step_to_reasoning": 1.7820267763448094, "adv/std_final_conf": 0.7381003499031067, "adv/std_reasoning": 0.6405137181282043, "adv/std_step_conf": 0.843903660774231, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 14.47265625, "calib/ece": 0.2170791666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8041666666666667, "calib/gap": 0.2905591830038552, "calib/mean_conf": 0.8499958333333333, "calib/mu_c": 0.9383742514970059, "calib/mu_w": 0.6478150684931507, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1856208333333334, "calib/std_conf": 0.3142465459624108, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31700620404411767, "calib/step_q_c_n": 2176.0, "calib/step_q_gap": -0.056702102038289126, "calib/step_q_w": 0.3737083060824068, "calib/step_q_w_n": 1529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 724.83984375, "completions/mean_terminated_length": 769.9544067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.1472, "grad_norm": 1.2674281597137451, "kl": 0.250762939453125, "learning_rate": 1.75e-06, "loss": -0.1559, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.021356452256441116, "mask/share_reasoning": 0.7491965889930725, "mask/share_step_conf": 0.17085321247577667, "num_tokens": 42246860.0, "reward": 0.6420503854751587, "reward_std": 0.2165650725364685, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.739525318145752, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.22582541406154633, "step": 138 }, { "adv/mean_abs_final_conf": 0.5015965104103088, "adv/mean_abs_reasoning": 0.2608618140220642, "adv/mean_abs_step_conf": 0.5609601140022278, "adv/ratio_final_to_reasoning": 1.922843756533422, "adv/ratio_step_to_reasoning": 2.150410998655329, "adv/std_final_conf": 0.7413507103919983, "adv/std_reasoning": 0.5483863949775696, "adv/std_step_conf": 0.8105617165565491, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.140625, "calib/ece": 0.30063399999999985, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.916, "calib/gap": 0.020221792658208892, "calib/mean_conf": 0.9404739999999999, "calib/mu_c": 0.9467023121387284, "calib/mu_w": 0.9264805194805195, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27455399999999985, "calib/std_conf": 0.1952396484426255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31638177863702277, "calib/step_q_c_n": 2069.0, "calib/step_q_gap": -0.008583572662303529, "calib/step_q_w": 0.3249653512993263, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 686.16796875, "completions/mean_terminated_length": 702.6360473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.14826666666666666, "grad_norm": 1.2241623401641846, "kl": 0.256683349609375, "learning_rate": 1.7222222222222224e-06, "loss": -0.0701, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02256494015455246, "mask/share_reasoning": 0.7774582505226135, "mask/share_step_conf": 0.1765393167734146, "num_tokens": 42525615.0, "reward": 0.6329671144485474, "reward_std": 0.17554357647895813, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.679322361946106, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2561429738998413, "step": 139 }, { "adv/mean_abs_final_conf": 0.46179768443107605, "adv/mean_abs_reasoning": 0.32739922404289246, "adv/mean_abs_step_conf": 0.6492865681648254, "adv/ratio_final_to_reasoning": 1.41050329542191, "adv/ratio_step_to_reasoning": 1.983164651849519, "adv/std_final_conf": 0.7165549397468567, "adv/std_reasoning": 0.6611654162406921, "adv/std_step_conf": 0.8758783936500549, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.390625, "calib/ece": 0.25146485943775093, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9156626506024096, "calib/gap": 0.13560537931034478, "calib/mean_conf": 0.9435130522088354, "calib/mu_c": 0.9843580459770115, "calib/mu_w": 0.8487526666666667, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24809136546184732, "calib/std_conf": 0.18642138216210663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.335719409465914, "calib/step_q_c_n": 2303.0, "calib/step_q_gap": 0.02131028564259757, "calib/step_q_w": 0.3144091238233164, "calib/step_q_w_n": 1381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 778.75, "completions/mean_terminated_length": 800.6425170898438, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.14933333333333335, "grad_norm": 1.2806999683380127, "kl": 0.2357177734375, "learning_rate": 1.6944444444444446e-06, "loss": -0.0411, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020671725273132324, "mask/share_reasoning": 0.7754533290863037, "mask/share_step_conf": 0.17653116583824158, "num_tokens": 42829991.0, "reward": 0.6468392610549927, "reward_std": 0.19622653722763062, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7314028143882751, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2318069040775299, "step": 140 }, { "adv/mean_abs_final_conf": 0.525245189666748, "adv/mean_abs_reasoning": 0.33174651861190796, "adv/mean_abs_step_conf": 0.5822703838348389, "adv/ratio_final_to_reasoning": 1.5832726500476213, "adv/ratio_step_to_reasoning": 1.7551665237397864, "adv/std_final_conf": 0.7608556151390076, "adv/std_reasoning": 0.6402722597122192, "adv/std_step_conf": 0.8274043798446655, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.44921875, "calib/ece": 0.23398999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.896, "calib/gap": 0.1750773809523808, "calib/mean_conf": 0.91907, "calib/mu_c": 0.9680916666666666, "calib/mu_w": 0.7930142857142858, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21652999999999997, "calib/std_conf": 0.2410115476486552, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32720863007959783, "calib/step_q_c_n": 2387.0, "calib/step_q_gap": -0.04450909095698752, "calib/step_q_w": 0.37171772103658535, "calib/step_q_w_n": 1312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 814.62109375, "completions/mean_terminated_length": 834.1720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.1504, "grad_norm": 0.9242333173751831, "kl": 0.212890625, "learning_rate": 1.6666666666666667e-06, "loss": -0.0614, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01946493610739708, "mask/share_reasoning": 0.7856360077857971, "mask/share_step_conf": 0.17146152257919312, "num_tokens": 43145630.0, "reward": 0.6645768880844116, "reward_std": 0.20759084820747375, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7531989812850952, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.24001729488372803, "step": 141 }, { "adv/mean_abs_final_conf": 0.5690226554870605, "adv/mean_abs_reasoning": 0.42950016260147095, "adv/mean_abs_step_conf": 0.6370354294776917, "adv/ratio_final_to_reasoning": 1.3248485216874089, "adv/ratio_step_to_reasoning": 1.4832018354060337, "adv/std_final_conf": 0.7847913503646851, "adv/std_reasoning": 0.7015143036842346, "adv/std_step_conf": 0.8598779439926147, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 16.06640625, "calib/ece": 0.2856208333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8708333333333333, "calib/gap": 0.18633911483253596, "calib/mean_conf": 0.9042875, "calib/mu_c": 0.9726118421052633, "calib/mu_w": 0.7862727272727273, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2782875000000001, "calib/std_conf": 0.25235399378072726, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36300235109717865, "calib/step_q_c_n": 1914.0, "calib/step_q_gap": -0.036242441990588514, "calib/step_q_w": 0.39924479308776717, "calib/step_q_w_n": 2199.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 833.3125, "completions/mean_terminated_length": 881.5206298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 0.15146666666666667, "grad_norm": 0.8023488521575928, "kl": 0.2142333984375, "learning_rate": 1.638888888888889e-06, "loss": -0.1356, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.019096028059720993, "mask/share_reasoning": 0.7625699043273926, "mask/share_step_conf": 0.16364656388759613, "num_tokens": 43464118.0, "reward": 0.6316433548927307, "reward_std": 0.24024856090545654, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6723967790603638, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.284639835357666, "step": 142 }, { "adv/mean_abs_final_conf": 0.5092817544937134, "adv/mean_abs_reasoning": 0.3617568016052246, "adv/mean_abs_step_conf": 0.6399293541908264, "adv/ratio_final_to_reasoning": 1.407801462844308, "adv/ratio_step_to_reasoning": 1.7689490601179185, "adv/std_final_conf": 0.7648191452026367, "adv/std_reasoning": 0.6815710663795471, "adv/std_step_conf": 0.8593825697898865, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.4375, "calib/ece": 0.24520800000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.768, "calib/gap": 0.22287359708193044, "calib/mean_conf": 0.8428880000000001, "calib/mu_c": 0.9213395061728396, "calib/mu_w": 0.6984659090909091, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22004800000000008, "calib/std_conf": 0.3032960425986465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34825138818778395, "calib/step_q_c_n": 1981.0, "calib/step_q_gap": 0.010504449412273742, "calib/step_q_w": 0.3377469387755102, "calib/step_q_w_n": 1715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 805.5546875, "completions/mean_terminated_length": 824.8880615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.15253333333333333, "grad_norm": 1.2207856178283691, "kl": 0.22381591796875, "learning_rate": 1.6111111111111113e-06, "loss": -0.0968, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019887078553438187, "mask/share_reasoning": 0.7877940535545349, "mask/share_step_conf": 0.168881356716156, "num_tokens": 43777676.0, "reward": 0.6582680940628052, "reward_std": 0.20583592355251312, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7261790037155151, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.26848214864730835, "step": 143 }, { "adv/mean_abs_final_conf": 0.47069185972213745, "adv/mean_abs_reasoning": 0.3482842445373535, "adv/mean_abs_step_conf": 0.6167457103729248, "adv/ratio_final_to_reasoning": 1.351458950856032, "adv/ratio_step_to_reasoning": 1.7708113991552632, "adv/std_final_conf": 0.7149190306663513, "adv/std_reasoning": 0.6403000950813293, "adv/std_step_conf": 0.8437038064002991, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.3515625, "calib/ece": 0.16172199999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.816, "calib/gap": 0.28898601578586147, "calib/mean_conf": 0.877882, "calib/mu_c": 0.9495505319148935, "calib/mu_w": 0.660564516129032, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14380199999999996, "calib/std_conf": 0.27151917625832617, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35282960776043865, "calib/step_q_c_n": 2371.0, "calib/step_q_gap": -0.02016370647069793, "calib/step_q_w": 0.3729933142311366, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 718.49609375, "completions/mean_terminated_length": 738.6947631835938, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.1536, "grad_norm": 1.1345633268356323, "kl": 0.2506103515625, "learning_rate": 1.5833333333333333e-06, "loss": -0.0998, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.022363772615790367, "mask/share_reasoning": 0.7747567892074585, "mask/share_step_conf": 0.17553575336933136, "num_tokens": 44065739.0, "reward": 0.7319254875183105, "reward_std": 0.17399638891220093, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.8122310042381287, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.3094324469566345, "step": 144 }, { "adv/mean_abs_final_conf": 0.6201844811439514, "adv/mean_abs_reasoning": 0.4980112612247467, "adv/mean_abs_step_conf": 0.6163833141326904, "adv/ratio_final_to_reasoning": 1.2453222033950542, "adv/ratio_step_to_reasoning": 1.2376895105079235, "adv/std_final_conf": 0.8292064666748047, "adv/std_reasoning": 0.7754777669906616, "adv/std_step_conf": 0.8439358472824097, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.78515625, "calib/ece": 0.2414574898785426, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8097165991902834, "calib/gap": 0.13474309664694284, "calib/mean_conf": 0.8806477732793522, "calib/mu_c": 0.923198224852071, "calib/mu_w": 0.7884551282051282, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21894736842105272, "calib/std_conf": 0.2584896951428465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3552679582063561, "calib/step_q_c_n": 2297.0, "calib/step_q_gap": -0.04995852028826753, "calib/step_q_w": 0.4052264784946236, "calib/step_q_w_n": 1488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 796.2421875, "completions/mean_terminated_length": 821.9273681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.15466666666666667, "grad_norm": 0.8662506937980652, "kl": 0.230865478515625, "learning_rate": 1.5555555555555558e-06, "loss": -0.0984, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01997159980237484, "mask/share_reasoning": 0.773935079574585, "mask/share_step_conf": 0.17484331130981445, "num_tokens": 44372281.0, "reward": 0.6226569414138794, "reward_std": 0.24305683374404907, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7108544111251831, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.20945948362350464, "step": 145 }, { "adv/mean_abs_final_conf": 0.6963587403297424, "adv/mean_abs_reasoning": 0.5124849081039429, "adv/mean_abs_step_conf": 0.6940493583679199, "adv/ratio_final_to_reasoning": 1.358788774690134, "adv/ratio_step_to_reasoning": 1.354282530847039, "adv/std_final_conf": 0.8616522550582886, "adv/std_reasoning": 0.7755056619644165, "adv/std_step_conf": 0.8758512735366821, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 15.2734375, "calib/ece": 0.297390041493776, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7468879668049793, "calib/gap": 0.18462630226915944, "calib/mean_conf": 0.8753153526970954, "calib/mu_c": 0.9503916083916084, "calib/mu_w": 0.765765306122449, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28967219917012454, "calib/std_conf": 0.2290983049539202, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34514803471158756, "calib/step_q_c_n": 1959.0, "calib/step_q_gap": -0.03828712674407625, "calib/step_q_w": 0.3834351614556638, "calib/step_q_w_n": 1951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 775.6171875, "completions/mean_terminated_length": 823.8921508789062, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.15573333333333333, "grad_norm": 1.391205072402954, "kl": 0.225616455078125, "learning_rate": 1.527777777777778e-06, "loss": -0.2554, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.019138246774673462, "mask/share_reasoning": 0.7537720799446106, "mask/share_step_conf": 0.16849590837955475, "num_tokens": 44678055.0, "reward": 0.6086395382881165, "reward_std": 0.2709296643733978, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6738836765289307, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.2433953881263733, "step": 146 }, { "adv/mean_abs_final_conf": 0.5746307373046875, "adv/mean_abs_reasoning": 0.40634775161743164, "adv/mean_abs_step_conf": 0.5668737888336182, "adv/ratio_final_to_reasoning": 1.414135392696084, "adv/ratio_step_to_reasoning": 1.3950459589778132, "adv/std_final_conf": 0.7812044620513916, "adv/std_reasoning": 0.7015212178230286, "adv/std_step_conf": 0.7938024997711182, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.22265625, "calib/ece": 0.19931224489795923, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6408163265306123, "calib/gap": 0.25400596760443295, "calib/mean_conf": 0.7928632653061224, "calib/mu_c": 0.8882450980392157, "calib/mu_w": 0.6342391304347827, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1838428571428572, "calib/std_conf": 0.2943720654822338, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3583738508682329, "calib/step_q_c_n": 1958.0, "calib/step_q_gap": -0.03442353475268212, "calib/step_q_w": 0.39279738562091504, "calib/step_q_w_n": 1683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 764.7734375, "completions/mean_terminated_length": 799.1101684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 261.0, "epoch": 0.1568, "grad_norm": 1.6528960466384888, "kl": 0.23443603515625, "learning_rate": 1.5e-06, "loss": -0.1632, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020229840651154518, "mask/share_reasoning": 0.7700471878051758, "mask/share_step_conf": 0.16675424575805664, "num_tokens": 44977517.0, "reward": 0.6164664626121521, "reward_std": 0.19761189818382263, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7365533709526062, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.1854419857263565, "step": 147 }, { "adv/mean_abs_final_conf": 0.5271502137184143, "adv/mean_abs_reasoning": 0.3972645401954651, "adv/mean_abs_step_conf": 0.5814423561096191, "adv/ratio_final_to_reasoning": 1.3269500808177892, "adv/ratio_step_to_reasoning": 1.4636150405559316, "adv/std_final_conf": 0.754736065864563, "adv/std_reasoning": 0.6613168120384216, "adv/std_step_conf": 0.8106851577758789, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.140625, "calib/ece": 0.13375510204081634, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.726530612244898, "calib/gap": 0.27694385686585565, "calib/mean_conf": 0.8571020408163265, "calib/mu_c": 0.9271857923497266, "calib/mu_w": 0.650241935483871, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12195918367346939, "calib/std_conf": 0.2374820504407635, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41528641425389756, "calib/step_q_c_n": 2245.0, "calib/step_q_gap": -0.003044494837011469, "calib/step_q_w": 0.41833090909090903, "calib/step_q_w_n": 1375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 701.2734375, "completions/mean_terminated_length": 732.7591552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.15786666666666666, "grad_norm": 1.5334172248840332, "kl": 0.25244140625, "learning_rate": 1.4722222222222225e-06, "loss": -0.0908, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02250342071056366, "mask/share_reasoning": 0.7561869025230408, "mask/share_step_conf": 0.17834091186523438, "num_tokens": 45262155.0, "reward": 0.7265738248825073, "reward_std": 0.17858600616455078, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.8107410073280334, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.3080315589904785, "step": 148 }, { "adv/mean_abs_final_conf": 0.6013184189796448, "adv/mean_abs_reasoning": 0.44574636220932007, "adv/mean_abs_step_conf": 0.685305118560791, "adv/ratio_final_to_reasoning": 1.3490147535904486, "adv/ratio_step_to_reasoning": 1.5374328915756255, "adv/std_final_conf": 0.8094089031219482, "adv/std_reasoning": 0.7206778526306152, "adv/std_step_conf": 0.891316831111908, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 15.16796875, "calib/ece": 0.1078151260504202, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6764705882352942, "calib/gap": 0.3305325670498085, "calib/mean_conf": 0.810672268907563, "calib/mu_c": 0.8912222222222224, "calib/mu_w": 0.5606896551724139, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08109243697478997, "calib/std_conf": 0.28979123489025993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37588042515500447, "calib/step_q_c_n": 2258.0, "calib/step_q_gap": -0.03427043691396109, "calib/step_q_w": 0.41015086206896556, "calib/step_q_w_n": 1624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 757.67578125, "completions/mean_terminated_length": 808.1875610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.15893333333333334, "grad_norm": 1.1942827701568604, "kl": 0.230194091796875, "learning_rate": 1.4444444444444445e-06, "loss": -0.1477, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.019954249262809753, "mask/share_reasoning": 0.7544931173324585, "mask/share_step_conf": 0.16305264830589294, "num_tokens": 45560576.0, "reward": 0.6850230693817139, "reward_std": 0.22184012830257416, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7878589630126953, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.2564058005809784, "step": 149 }, { "adv/mean_abs_final_conf": 0.5935416221618652, "adv/mean_abs_reasoning": 0.30029815435409546, "adv/mean_abs_step_conf": 0.6016957759857178, "adv/ratio_final_to_reasoning": 1.9765077259249246, "adv/ratio_step_to_reasoning": 2.003661252197476, "adv/std_final_conf": 0.8034288287162781, "adv/std_reasoning": 0.596147894859314, "adv/std_step_conf": 0.827460527420044, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 13.7734375, "calib/ece": 0.14944214876033057, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6942148760330579, "calib/gap": 0.2672503170577044, "calib/mean_conf": 0.8287809917355372, "calib/mu_c": 0.912710843373494, "calib/mu_w": 0.6454605263157895, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14613636363636365, "calib/std_conf": 0.26962259660006904, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4318985823336969, "calib/step_q_c_n": 1834.0, "calib/step_q_gap": -0.01707937274904553, "calib/step_q_w": 0.4489779550827424, "calib/step_q_w_n": 1692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 642.1953125, "completions/mean_terminated_length": 679.3471069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.16, "grad_norm": 1.7494480609893799, "kl": 0.28179931640625, "learning_rate": 1.4166666666666667e-06, "loss": -0.1658, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.023424550890922546, "mask/share_reasoning": 0.7520290613174438, "mask/share_step_conf": 0.1698589324951172, "num_tokens": 45829938.0, "reward": 0.6518896222114563, "reward_std": 0.17572720348834991, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7625117897987366, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.2225174605846405, "step": 150 }, { "adv/mean_abs_final_conf": 0.6138310432434082, "adv/mean_abs_reasoning": 0.4014820456504822, "adv/mean_abs_step_conf": 0.498024582862854, "adv/ratio_final_to_reasoning": 1.528912811652331, "adv/ratio_step_to_reasoning": 1.2404653913127133, "adv/std_final_conf": 0.8315180540084839, "adv/std_reasoning": 0.7014200091362, "adv/std_step_conf": 0.7584320902824402, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 16.0390625, "calib/ece": 0.14950840336134447, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5630252100840336, "calib/gap": 0.27281709956709954, "calib/mean_conf": 0.7419453781512605, "calib/mu_c": 0.8382337662337663, "calib/mu_w": 0.5654166666666668, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12219747899159655, "calib/std_conf": 0.31073299280052796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4112959486166008, "calib/step_q_c_n": 2024.0, "calib/step_q_gap": -0.0113418515755222, "calib/step_q_w": 0.422637800192123, "calib/step_q_w_n": 2082.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 813.1882934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.16106666666666666, "grad_norm": 1.1843147277832031, "kl": 0.229949951171875, "learning_rate": 1.3888888888888892e-06, "loss": -0.2223, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.019284039735794067, "mask/share_reasoning": 0.7494036555290222, "mask/share_step_conf": 0.1649060696363449, "num_tokens": 46131314.0, "reward": 0.5891934633255005, "reward_std": 0.1657007783651352, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7350819110870361, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.1370549350976944, "step": 151 }, { "adv/mean_abs_final_conf": 0.6327756643295288, "adv/mean_abs_reasoning": 0.4947320222854614, "adv/mean_abs_step_conf": 0.6735621690750122, "adv/ratio_final_to_reasoning": 1.279027101189775, "adv/ratio_step_to_reasoning": 1.3614687118157986, "adv/std_final_conf": 0.8337776064872742, "adv/std_reasoning": 0.7577208280563354, "adv/std_step_conf": 0.8758518695831299, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 14.60546875, "calib/ece": 0.121781512605042, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.542016806722689, "calib/gap": 0.27947975151372173, "calib/mean_conf": 0.7352268907563024, "calib/mu_c": 0.8303439490445859, "calib/mu_w": 0.5508641975308641, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09867226890756299, "calib/std_conf": 0.309116956924721, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4365304391217565, "calib/step_q_c_n": 2004.0, "calib/step_q_gap": 0.0100405255770879, "calib/step_q_w": 0.4264899135446686, "calib/step_q_w_n": 1735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 738.37890625, "completions/mean_terminated_length": 790.8995361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.16213333333333332, "grad_norm": 1.4904396533966064, "kl": 0.239471435546875, "learning_rate": 1.3611111111111112e-06, "loss": -0.2374, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.02063142880797386, "mask/share_reasoning": 0.7482746839523315, "mask/share_step_conf": 0.1646876186132431, "num_tokens": 46425731.0, "reward": 0.6711384057998657, "reward_std": 0.20814509689807892, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7434897422790527, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.28941214084625244, "step": 152 }, { "adv/mean_abs_final_conf": 0.6002535820007324, "adv/mean_abs_reasoning": 0.3623543977737427, "adv/mean_abs_step_conf": 0.620030403137207, "adv/ratio_final_to_reasoning": 1.6565373173020963, "adv/ratio_step_to_reasoning": 1.7111159874051247, "adv/std_final_conf": 0.8012081384658813, "adv/std_reasoning": 0.6612955927848816, "adv/std_step_conf": 0.8599606156349182, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 15.05078125, "calib/ece": 0.14890041493775932, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6390041493775933, "calib/gap": 0.26141066732412876, "calib/mean_conf": 0.7865767634854771, "calib/mu_c": 0.8646745562130177, "calib/mu_w": 0.6032638888888889, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1171161825726141, "calib/std_conf": 0.3065826960243486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4296149809160305, "calib/step_q_c_n": 2096.0, "calib/step_q_gap": 0.002485328098728312, "calib/step_q_w": 0.4271296528173022, "calib/step_q_w_n": 1757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 775.05078125, "completions/mean_terminated_length": 819.8883666992188, "completions/min_length": 0.0, "completions/min_terminated_length": 287.0, "epoch": 0.1632, "grad_norm": 1.214044213294983, "kl": 0.223876953125, "learning_rate": 1.3333333333333334e-06, "loss": -0.0627, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018918223679065704, "mask/share_reasoning": 0.7644103169441223, "mask/share_step_conf": 0.16198396682739258, "num_tokens": 46731464.0, "reward": 0.669912576675415, "reward_std": 0.19476324319839478, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.749024510383606, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.2712693512439728, "step": 153 }, { "adv/mean_abs_final_conf": 0.5115103721618652, "adv/mean_abs_reasoning": 0.4170297682285309, "adv/mean_abs_step_conf": 0.6089221239089966, "adv/ratio_final_to_reasoning": 1.2265560186139022, "adv/ratio_step_to_reasoning": 1.4601406669255068, "adv/std_final_conf": 0.7640756368637085, "adv/std_reasoning": 0.7015310525894165, "adv/std_step_conf": 0.8599308729171753, "calib/answer_extract_rate": 0.8984375, "calib/avg_num_step_conf": 15.87890625, "calib/ece": 0.17054347826086955, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.4025906394199076, "calib/mean_conf": 0.8080217391304347, "calib/mu_c": 0.9515540540540539, "calib/mu_w": 0.5489634146341463, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16754347826086954, "calib/std_conf": 0.313698656920136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4056774020397209, "calib/step_q_c_n": 1863.0, "calib/step_q_gap": -0.036074641557009346, "calib/step_q_w": 0.44175204359673026, "calib/step_q_w_n": 2202.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 703.546875, "completions/mean_terminated_length": 776.3275756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.16426666666666667, "grad_norm": 0.8250810503959656, "kl": 0.239288330078125, "learning_rate": 1.3055555555555556e-06, "loss": -0.2055, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.01885397545993328, "mask/share_reasoning": 0.7259734869003296, "mask/share_step_conf": 0.16142255067825317, "num_tokens": 47016012.0, "reward": 0.5886725783348083, "reward_std": 0.21505558490753174, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7455453872680664, "rewards/format_reward_step": 0.8984375, "rewards/step_margin_reward": 0.13648727536201477, "step": 154 }, { "adv/mean_abs_final_conf": 0.6118817925453186, "adv/mean_abs_reasoning": 0.47593390941619873, "adv/mean_abs_step_conf": 0.6540273427963257, "adv/ratio_final_to_reasoning": 1.2856444570126964, "adv/ratio_step_to_reasoning": 1.3741978242285449, "adv/std_final_conf": 0.8000847697257996, "adv/std_reasoning": 0.7207589149475098, "adv/std_step_conf": 0.8598897457122803, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 14.05078125, "calib/ece": 0.2131818181818182, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6735537190082644, "calib/gap": 0.30535830266291397, "calib/mean_conf": 0.8160743801652893, "calib/mu_c": 0.9334228187919462, "calib/mu_w": 0.6280645161290322, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20677685950413222, "calib/std_conf": 0.28303673987749994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.404281045751634, "calib/step_q_c_n": 1836.0, "calib/step_q_gap": 0.005331017358675416, "calib/step_q_w": 0.39895002839295857, "calib/step_q_w_n": 1761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 698.140625, "completions/mean_terminated_length": 738.5288696289062, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.16533333333333333, "grad_norm": 1.3057198524475098, "kl": 0.243377685546875, "learning_rate": 1.2777777777777779e-06, "loss": -0.1336, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.020220771431922913, "mask/share_reasoning": 0.753272294998169, "mask/share_step_conf": 0.17181944847106934, "num_tokens": 47301952.0, "reward": 0.629758358001709, "reward_std": 0.2366112768650055, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7445582151412964, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.20948976278305054, "step": 155 }, { "adv/mean_abs_final_conf": 0.5225203037261963, "adv/mean_abs_reasoning": 0.36599862575531006, "adv/mean_abs_step_conf": 0.6580045223236084, "adv/ratio_final_to_reasoning": 1.4276564635942908, "adv/ratio_step_to_reasoning": 1.797833314170748, "adv/std_final_conf": 0.7769670486450195, "adv/std_reasoning": 0.6816434860229492, "adv/std_step_conf": 0.8757668137550354, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 13.703125, "calib/ece": 0.20048387096774195, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.657258064516129, "calib/gap": 0.21715750232991615, "calib/mean_conf": 0.7933064516129033, "calib/mu_c": 0.858103448275862, "calib/mu_w": 0.6409459459459459, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14608870967741938, "calib/std_conf": 0.31163938727367596, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42193717277486914, "calib/step_q_c_n": 2101.0, "calib/step_q_gap": 0.02631172856733538, "calib/step_q_w": 0.39562544420753376, "calib/step_q_w_n": 1407.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 771.7578125, "completions/mean_terminated_length": 793.4537963867188, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.1664, "grad_norm": 1.2064683437347412, "kl": 0.23663330078125, "learning_rate": 1.25e-06, "loss": -0.0808, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.021276123821735382, "mask/share_reasoning": 0.779464840888977, "mask/share_step_conf": 0.17191532254219055, "num_tokens": 47604282.0, "reward": 0.6614023447036743, "reward_std": 0.17743799090385437, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7517943382263184, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.24132287502288818, "step": 156 }, { "adv/mean_abs_final_conf": 0.529394805431366, "adv/mean_abs_reasoning": 0.47357791662216187, "adv/mean_abs_step_conf": 0.6258033514022827, "adv/ratio_final_to_reasoning": 1.1178621022013087, "adv/ratio_step_to_reasoning": 1.3214369366415621, "adv/std_final_conf": 0.777289867401123, "adv/std_reasoning": 0.757584273815155, "adv/std_step_conf": 0.8599728941917419, "calib/answer_extract_rate": 0.92578125, "calib/avg_num_step_conf": 15.8203125, "calib/ece": 0.16551476793248956, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.7805907172995781, "calib/gap": 0.3085774451792551, "calib/mean_conf": 0.8546286919831223, "calib/mu_c": 0.9431656804733728, "calib/mu_w": 0.6345882352941177, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15353164556962037, "calib/std_conf": 0.29181688470317674, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4209602349751469, "calib/step_q_c_n": 2213.0, "calib/step_q_gap": -0.069750706777711, "calib/step_q_w": 0.4907109417528579, "calib/step_q_w_n": 1837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 703.27734375, "completions/mean_terminated_length": 756.4664306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.16746666666666668, "grad_norm": 1.2208685874938965, "kl": 0.237213134765625, "learning_rate": 1.2222222222222223e-06, "loss": -0.1109, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.021750878542661667, "mask/share_reasoning": 0.7368344068527222, "mask/share_step_conf": 0.17110225558280945, "num_tokens": 47888049.0, "reward": 0.629758358001709, "reward_std": 0.22569067776203156, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7558800578117371, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.18644914031028748, "step": 157 }, { "adv/mean_abs_final_conf": 0.499210000038147, "adv/mean_abs_reasoning": 0.45436128973960876, "adv/mean_abs_step_conf": 0.7095964550971985, "adv/ratio_final_to_reasoning": 1.098707155101705, "adv/ratio_step_to_reasoning": 1.5617449618207202, "adv/std_final_conf": 0.7592298984527588, "adv/std_reasoning": 0.7206823825836182, "adv/std_step_conf": 0.8913768529891968, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 13.5546875, "calib/ece": 0.20483669354838707, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8185483870967742, "calib/gap": 0.20857639921722104, "calib/mean_conf": 0.886530241935484, "calib/mu_c": 0.9479257142857143, "calib/mu_w": 0.7393493150684932, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19286088709677415, "calib/std_conf": 0.25592254469626635, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42429733840304185, "calib/step_q_c_n": 2104.0, "calib/step_q_gap": -0.057404345345128005, "calib/step_q_w": 0.48170168374816985, "calib/step_q_w_n": 1366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 706.2109375, "completions/mean_terminated_length": 728.991943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.16853333333333334, "grad_norm": 1.058288335800171, "kl": 0.25054931640625, "learning_rate": 1.1944444444444446e-06, "loss": -0.0759, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0231400728225708, "mask/share_reasoning": 0.7669098377227783, "mask/share_step_conf": 0.1787000596523285, "num_tokens": 48174079.0, "reward": 0.6625410914421082, "reward_std": 0.2515299916267395, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7563234567642212, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.23828986287117004, "step": 158 }, { "adv/mean_abs_final_conf": 0.4797705411911011, "adv/mean_abs_reasoning": 0.42208725214004517, "adv/mean_abs_step_conf": 0.6270284652709961, "adv/ratio_final_to_reasoning": 1.1366620023670297, "adv/ratio_step_to_reasoning": 1.4855422950867825, "adv/std_final_conf": 0.7398706078529358, "adv/std_reasoning": 0.7013546228408813, "adv/std_step_conf": 0.860073447227478, "calib/answer_extract_rate": 0.94921875, "calib/avg_num_step_conf": 14.24609375, "calib/ece": 0.17094650205761305, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7613168724279835, "calib/gap": 0.1921494607087827, "calib/mean_conf": 0.8795061728395062, "calib/mu_c": 0.9316949152542373, "calib/mu_w": 0.7395454545454546, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16102880658436203, "calib/std_conf": 0.24471820310939163, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.418492017208413, "calib/step_q_c_n": 2092.0, "calib/step_q_gap": -0.09835685738965771, "calib/step_q_w": 0.5168488745980707, "calib/step_q_w_n": 1555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 673.609375, "completions/mean_terminated_length": 709.6460571289062, "completions/min_length": 0.0, "completions/min_terminated_length": 289.0, "epoch": 0.1696, "grad_norm": 1.5005854368209839, "kl": 0.256011962890625, "learning_rate": 1.1666666666666668e-06, "loss": -0.1241, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.022273514419794083, "mask/share_reasoning": 0.7569800615310669, "mask/share_step_conf": 0.16996517777442932, "num_tokens": 48451307.0, "reward": 0.6668627262115479, "reward_std": 0.2029997706413269, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7550758123397827, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.25052475929260254, "step": 159 }, { "adv/mean_abs_final_conf": 0.49061745405197144, "adv/mean_abs_reasoning": 0.4065553843975067, "adv/mean_abs_step_conf": 0.6536253690719604, "adv/ratio_final_to_reasoning": 1.2067665879743303, "adv/ratio_step_to_reasoning": 1.6077154408878342, "adv/std_final_conf": 0.7587159872055054, "adv/std_reasoning": 0.6817222237586975, "adv/std_step_conf": 0.8598037958145142, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 13.7578125, "calib/ece": 0.22365145228215771, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7385892116182573, "calib/gap": 0.16865381526104417, "calib/mean_conf": 0.8376348547717842, "calib/mu_c": 0.8901204819277108, "calib/mu_w": 0.7214666666666666, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18624481327800835, "calib/std_conf": 0.2938403034479326, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4180683289124668, "calib/step_q_c_n": 1885.0, "calib/step_q_gap": 0.013894840824501042, "calib/step_q_w": 0.40417348808796577, "calib/step_q_w_n": 1637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 672.109375, "completions/mean_terminated_length": 713.9419555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.17066666666666666, "grad_norm": 1.0761736631393433, "kl": 0.245941162109375, "learning_rate": 1.138888888888889e-06, "loss": -0.0619, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.020725876092910767, "mask/share_reasoning": 0.7537473440170288, "mask/share_step_conf": 0.166933074593544, "num_tokens": 48728207.0, "reward": 0.5751692652702332, "reward_std": 0.19423778355121613, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7055008411407471, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.1276501715183258, "step": 160 }, { "adv/mean_abs_final_conf": 0.3587230443954468, "adv/mean_abs_reasoning": 0.23824524879455566, "adv/mean_abs_step_conf": 0.6313453912734985, "adv/ratio_final_to_reasoning": 1.5056881352743445, "adv/ratio_step_to_reasoning": 2.649981036213327, "adv/std_final_conf": 0.6413493156433105, "adv/std_reasoning": 0.5482035875320435, "adv/std_step_conf": 0.8753535151481628, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.12109375, "calib/ece": 0.11637795275590558, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8385826771653543, "calib/gap": 0.2968382279741165, "calib/mean_conf": 0.8893700787401575, "calib/mu_c": 0.9466341463414635, "calib/mu_w": 0.649795918367347, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0993307086614174, "calib/std_conf": 0.26446052064293823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3838304777594728, "calib/step_q_c_n": 2428.0, "calib/step_q_gap": 0.008682329611324702, "calib/step_q_w": 0.3751481481481481, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 696.765625, "completions/mean_terminated_length": 702.251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.17173333333333332, "grad_norm": 1.3555985689163208, "kl": 0.246795654296875, "learning_rate": 1.111111111111111e-06, "loss": -0.0074, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023145297542214394, "mask/share_reasoning": 0.7910975217819214, "mask/share_step_conf": 0.17794470489025116, "num_tokens": 49010499.0, "reward": 0.7200655341148376, "reward_std": 0.16070488095283508, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8533073663711548, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.22822979092597961, "step": 161 }, { "adv/mean_abs_final_conf": 0.37374114990234375, "adv/mean_abs_reasoning": 0.28832659125328064, "adv/mean_abs_step_conf": 0.599334716796875, "adv/ratio_final_to_reasoning": 1.2962423905397982, "adv/ratio_step_to_reasoning": 2.0786661202205563, "adv/std_final_conf": 0.6621988415718079, "adv/std_reasoning": 0.5960350632667542, "adv/std_step_conf": 0.827337920665741, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.45703125, "calib/ece": 0.1290200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.852, "calib/gap": 0.23135292923139394, "calib/mean_conf": 0.9166200000000001, "calib/mu_c": 0.9619651741293532, "calib/mu_w": 0.7306122448979593, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12082000000000012, "calib/std_conf": 0.21414919005216898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36848850325379606, "calib/step_q_c_n": 2305.0, "calib/step_q_gap": -0.05397360081860214, "calib/step_q_w": 0.4224621040723982, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 676.40234375, "completions/mean_terminated_length": 692.6360473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.1728, "grad_norm": 1.0031044483184814, "kl": 0.25445556640625, "learning_rate": 1.0833333333333335e-06, "loss": -0.0842, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.023875439539551735, "mask/share_reasoning": 0.7759991884231567, "mask/share_step_conf": 0.1766878217458725, "num_tokens": 49287802.0, "reward": 0.7327048778533936, "reward_std": 0.16267752647399902, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.8367069363594055, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2763589918613434, "step": 162 }, { "adv/mean_abs_final_conf": 0.4780116081237793, "adv/mean_abs_reasoning": 0.4224798083305359, "adv/mean_abs_step_conf": 0.5790718793869019, "adv/ratio_final_to_reasoning": 1.131442494287909, "adv/ratio_step_to_reasoning": 1.3706498345451172, "adv/std_final_conf": 0.7584898471832275, "adv/std_reasoning": 0.7206315994262695, "adv/std_step_conf": 0.8108139038085938, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.765625, "calib/ece": 0.19218623481781372, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7206477732793523, "calib/gap": 0.41208835904628327, "calib/mean_conf": 0.8083805668016194, "calib/mu_c": 0.9618709677419355, "calib/mu_w": 0.5497826086956522, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18651821862348172, "calib/std_conf": 0.3330636358615001, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40701116725619, "calib/step_q_c_n": 1979.0, "calib/step_q_gap": -0.03158239187762446, "calib/step_q_w": 0.4385935591338145, "calib/step_q_w_n": 1801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 760.453125, "completions/mean_terminated_length": 788.1619873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.17386666666666667, "grad_norm": 0.8664852380752563, "kl": 0.232330322265625, "learning_rate": 1.0555555555555557e-06, "loss": -0.1173, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02123209834098816, "mask/share_reasoning": 0.7678343057632446, "mask/share_step_conf": 0.17577733099460602, "num_tokens": 49587310.0, "reward": 0.676498532295227, "reward_std": 0.19726118445396423, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7866039276123047, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.2523307800292969, "step": 163 }, { "adv/mean_abs_final_conf": 0.4758383333683014, "adv/mean_abs_reasoning": 0.38787057995796204, "adv/mean_abs_step_conf": 0.6178101301193237, "adv/ratio_final_to_reasoning": 1.226796663515633, "adv/ratio_step_to_reasoning": 1.5928254475662549, "adv/std_final_conf": 0.7404839396476746, "adv/std_reasoning": 0.6816709637641907, "adv/std_step_conf": 0.8437954783439636, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 14.21875, "calib/ece": 0.17573170731707322, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7073170731707317, "calib/gap": 0.40962879064669333, "calib/mean_conf": 0.8125609756097562, "calib/mu_c": 0.9540993788819875, "calib/mu_w": 0.5444705882352942, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1669105691056911, "calib/std_conf": 0.316181000605292, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4039997516145057, "calib/step_q_c_n": 2013.0, "calib/step_q_gap": 0.022508049094530358, "calib/step_q_w": 0.38149170251997533, "calib/step_q_w_n": 1627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 801.28515625, "completions/mean_terminated_length": 830.4818115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.17493333333333333, "grad_norm": 1.2860913276672363, "kl": 0.222442626953125, "learning_rate": 1.0277777777777777e-06, "loss": -0.1324, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01892763376235962, "mask/share_reasoning": 0.7816683053970337, "mask/share_step_conf": 0.1642477661371231, "num_tokens": 49898575.0, "reward": 0.6890560984611511, "reward_std": 0.20963336527347565, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.8015799522399902, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2585635185241699, "step": 164 }, { "adv/mean_abs_final_conf": 0.503420352935791, "adv/mean_abs_reasoning": 0.4133051931858063, "adv/mean_abs_step_conf": 0.6522657871246338, "adv/ratio_final_to_reasoning": 1.2180353918501876, "adv/ratio_step_to_reasoning": 1.5781698315883486, "adv/std_final_conf": 0.7587146759033203, "adv/std_reasoning": 0.7014722228050232, "adv/std_step_conf": 0.8600209951400757, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 14.3125, "calib/ece": 0.21150000000000008, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.743801652892562, "calib/gap": 0.304159090909091, "calib/mean_conf": 0.8329876033057851, "calib/mu_c": 0.9435909090909091, "calib/mu_w": 0.6394318181818182, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20406198347107446, "calib/std_conf": 0.3019152976478005, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.361900518134715, "calib/step_q_c_n": 1930.0, "calib/step_q_gap": -0.0336219732147659, "calib/step_q_w": 0.3955224913494809, "calib/step_q_w_n": 1734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 752.28125, "completions/mean_terminated_length": 795.8016357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 281.0, "epoch": 0.176, "grad_norm": 0.7004601955413818, "kl": 0.222503662109375, "learning_rate": 1.0000000000000002e-06, "loss": -0.1832, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019354721531271935, "mask/share_reasoning": 0.7650870084762573, "mask/share_step_conf": 0.16087083518505096, "num_tokens": 50196735.0, "reward": 0.6619113683700562, "reward_std": 0.22242799401283264, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7369174957275391, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.2775302827358246, "step": 165 }, { "adv/mean_abs_final_conf": 0.44442370533943176, "adv/mean_abs_reasoning": 0.3372817635536194, "adv/mean_abs_step_conf": 0.6248372793197632, "adv/ratio_final_to_reasoning": 1.3176630146170933, "adv/ratio_step_to_reasoning": 1.8525676358438208, "adv/std_final_conf": 0.721575140953064, "adv/std_reasoning": 0.6404114365577698, "adv/std_step_conf": 0.8596287369728088, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.265625, "calib/ece": 0.12714634146341472, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7276422764227642, "calib/gap": 0.44507352316192117, "calib/mean_conf": 0.8217804878048781, "calib/mu_c": 0.9393812154696134, "calib/mu_w": 0.4943076923076922, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10657723577235781, "calib/std_conf": 0.30992035527074396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37760216216216214, "calib/step_q_c_n": 2405.0, "calib/step_q_gap": -0.01938519645393899, "calib/step_q_w": 0.3969873586161011, "calib/step_q_w_n": 1503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 763.8359375, "completions/mean_terminated_length": 794.8861694335938, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.17706666666666668, "grad_norm": 0.934593915939331, "kl": 0.22467041015625, "learning_rate": 9.722222222222224e-07, "loss": -0.1519, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.020230621099472046, "mask/share_reasoning": 0.7602050304412842, "mask/share_step_conf": 0.18050184845924377, "num_tokens": 50498461.0, "reward": 0.7322015762329102, "reward_std": 0.18576332926750183, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.8410079479217529, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2898014187812805, "step": 166 }, { "adv/mean_abs_final_conf": 0.3625107407569885, "adv/mean_abs_reasoning": 0.3200824558734894, "adv/mean_abs_step_conf": 0.6579821705818176, "adv/ratio_final_to_reasoning": 1.1325542344009902, "adv/ratio_step_to_reasoning": 2.055664590507519, "adv/std_final_conf": 0.6561827659606934, "adv/std_reasoning": 0.6185396313667297, "adv/std_step_conf": 0.8756250143051147, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.37890625, "calib/ece": 0.16047430830039533, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.849802371541502, "calib/gap": 0.2274734785036293, "calib/mean_conf": 0.901699604743083, "calib/mu_c": 0.9502512562814072, "calib/mu_w": 0.7227777777777779, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.137806324110672, "calib/std_conf": 0.25181530004313113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3782980023734177, "calib/step_q_c_n": 2528.0, "calib/step_q_gap": -0.03730233207474282, "calib/step_q_w": 0.4156003344481605, "calib/step_q_w_n": 897.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 745.140625, "completions/mean_terminated_length": 753.976318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.17813333333333334, "grad_norm": 1.029808521270752, "kl": 0.228607177734375, "learning_rate": 9.444444444444445e-07, "loss": 0.0249, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02139473706483841, "mask/share_reasoning": 0.7871021628379822, "mask/share_step_conf": 0.17978432774543762, "num_tokens": 50794825.0, "reward": 0.6635854840278625, "reward_std": 0.1748582124710083, "rewards/accuracy_reward_step": 0.77734375, "rewards/final_brier_reward_step": 0.8220793008804321, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.1519666314125061, "step": 167 }, { "adv/mean_abs_final_conf": 0.5207133293151855, "adv/mean_abs_reasoning": 0.4379308223724365, "adv/mean_abs_step_conf": 0.6242270469665527, "adv/ratio_final_to_reasoning": 1.189031012921824, "adv/ratio_step_to_reasoning": 1.4254010338547976, "adv/std_final_conf": 0.7697093486785889, "adv/std_reasoning": 0.7206395268440247, "adv/std_step_conf": 0.84398353099823, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 15.6796875, "calib/ece": 0.17427385892116182, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7925311203319502, "calib/gap": 0.3150412274736485, "calib/mean_conf": 0.8546058091286306, "calib/mu_c": 0.9434971098265895, "calib/mu_w": 0.628455882352941, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1555186721991701, "calib/std_conf": 0.30617122614199294, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39054243860404997, "calib/step_q_c_n": 2321.0, "calib/step_q_gap": -0.01607640368773977, "calib/step_q_w": 0.40661884229178974, "calib/step_q_w_n": 1693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 812.65625, "completions/mean_terminated_length": 856.1316528320312, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.1792, "grad_norm": 1.4466100931167603, "kl": 0.209014892578125, "learning_rate": 9.166666666666666e-07, "loss": -0.0975, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01853233389556408, "mask/share_reasoning": 0.7595194578170776, "mask/share_step_conf": 0.17116698622703552, "num_tokens": 51107537.0, "reward": 0.6733118295669556, "reward_std": 0.2267422378063202, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7650150060653687, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.2581711411476135, "step": 168 }, { "adv/mean_abs_final_conf": 0.41139134764671326, "adv/mean_abs_reasoning": 0.33128395676612854, "adv/mean_abs_step_conf": 0.564087986946106, "adv/ratio_final_to_reasoning": 1.2418088447824744, "adv/ratio_step_to_reasoning": 1.7027325815971417, "adv/std_final_conf": 0.7026668190956116, "adv/std_reasoning": 0.6610427498817444, "adv/std_step_conf": 0.8105635046958923, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.74609375, "calib/ece": 0.17077777777777778, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7103174603174603, "calib/gap": 0.36775418835103735, "calib/mean_conf": 0.8118095238095238, "calib/mu_c": 0.9329349112426035, "calib/mu_w": 0.5651807228915662, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1559761904761905, "calib/std_conf": 0.3214193526014844, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37050913636363636, "calib/step_q_c_n": 2200.0, "calib/step_q_gap": 0.009315049934523423, "calib/step_q_w": 0.36119408642911294, "calib/step_q_w_n": 1319.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 765.328125, "completions/mean_terminated_length": 777.4762573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.18026666666666666, "grad_norm": 0.9083700776100159, "kl": 0.21929931640625, "learning_rate": 8.88888888888889e-07, "loss": -0.0294, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020419418811798096, "mask/share_reasoning": 0.7880574464797974, "mask/share_step_conf": 0.17589810490608215, "num_tokens": 51407645.0, "reward": 0.7167430520057678, "reward_std": 0.14973390102386475, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.8055509328842163, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2990289330482483, "step": 169 }, { "adv/mean_abs_final_conf": 0.5089055299758911, "adv/mean_abs_reasoning": 0.45473602414131165, "adv/mean_abs_step_conf": 0.5926600098609924, "adv/ratio_final_to_reasoning": 1.1191229701602572, "adv/ratio_step_to_reasoning": 1.303305606764113, "adv/std_final_conf": 0.7939143180847168, "adv/std_reasoning": 0.7393459677696228, "adv/std_step_conf": 0.827715277671814, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.0546875, "calib/ece": 0.12503238866396754, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6963562753036437, "calib/gap": 0.4970921227197347, "calib/mean_conf": 0.7695668016194331, "calib/mu_c": 0.9044055555555556, "calib/mu_w": 0.4073134328358209, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08292712550607281, "calib/std_conf": 0.36600016950823094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39332050159235665, "calib/step_q_c_n": 2512.0, "calib/step_q_gap": 0.024941664036469857, "calib/step_q_w": 0.3683788375558868, "calib/step_q_w_n": 1342.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 760.1640625, "completions/mean_terminated_length": 787.8623657226562, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.18133333333333335, "grad_norm": 1.3216930627822876, "kl": 0.21307373046875, "learning_rate": 8.611111111111112e-07, "loss": -0.1429, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01945631019771099, "mask/share_reasoning": 0.7675975561141968, "mask/share_step_conf": 0.1777898222208023, "num_tokens": 51706399.0, "reward": 0.7269425392150879, "reward_std": 0.22854897379875183, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.832879900932312, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.28741124272346497, "step": 170 }, { "adv/mean_abs_final_conf": 0.43825238943099976, "adv/mean_abs_reasoning": 0.3537520468235016, "adv/mean_abs_step_conf": 0.5394303798675537, "adv/ratio_final_to_reasoning": 1.2388688443396008, "adv/ratio_step_to_reasoning": 1.5248827101110545, "adv/std_final_conf": 0.7006688714027405, "adv/std_reasoning": 0.6611840724945068, "adv/std_step_conf": 0.7764024138450623, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 13.52734375, "calib/ece": 0.24796370967741943, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7298387096774194, "calib/gap": 0.27188741721854304, "calib/mean_conf": 0.8005443548387097, "calib/mu_c": 0.906887417218543, "calib/mu_w": 0.635, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21981854838709686, "calib/std_conf": 0.34674582533534126, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39197080173347776, "calib/step_q_c_n": 1846.0, "calib/step_q_gap": -0.025547441927623038, "calib/step_q_w": 0.4175182436611008, "calib/step_q_w_n": 1617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 728.2734375, "completions/mean_terminated_length": 751.76611328125, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.1824, "grad_norm": 1.237929344177246, "kl": 0.22076416015625, "learning_rate": 8.333333333333333e-07, "loss": -0.1205, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020590022206306458, "mask/share_reasoning": 0.7791228294372559, "mask/share_step_conf": 0.1690371185541153, "num_tokens": 51999733.0, "reward": 0.6143733263015747, "reward_std": 0.18360351026058197, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.711430549621582, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.20559734106063843, "step": 171 }, { "adv/mean_abs_final_conf": 0.42707115411758423, "adv/mean_abs_reasoning": 0.36983591318130493, "adv/mean_abs_step_conf": 0.6822973489761353, "adv/ratio_final_to_reasoning": 1.1547584723288373, "adv/ratio_step_to_reasoning": 1.8448650459795994, "adv/std_final_conf": 0.7022022008895874, "adv/std_reasoning": 0.6403645873069763, "adv/std_step_conf": 0.8912651538848877, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.93359375, "calib/ece": 0.18734, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.816, "calib/gap": 0.2514656964656967, "calib/mean_conf": 0.8747, "calib/mu_c": 0.9400810810810812, "calib/mu_w": 0.6886153846153845, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16102000000000002, "calib/std_conf": 0.28317911293031484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3708603583916084, "calib/step_q_c_n": 2288.0, "calib/step_q_gap": -0.056880923860150756, "calib/step_q_w": 0.42774128225175917, "calib/step_q_w_n": 1279.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 730.76171875, "completions/mean_terminated_length": 751.30517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.18346666666666667, "grad_norm": 1.075636863708496, "kl": 0.22918701171875, "learning_rate": 8.055555555555557e-07, "loss": -0.044, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021154478192329407, "mask/share_reasoning": 0.7770688533782959, "mask/share_step_conf": 0.1744329333305359, "num_tokens": 52290160.0, "reward": 0.7800790667533875, "reward_std": 0.18976718187332153, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7871382236480713, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.43317610025405884, "step": 172 }, { "adv/mean_abs_final_conf": 0.45317739248275757, "adv/mean_abs_reasoning": 0.3944540023803711, "adv/mean_abs_step_conf": 0.5952010750770569, "adv/ratio_final_to_reasoning": 1.1488725928701813, "adv/ratio_step_to_reasoning": 1.5089239087073727, "adv/std_final_conf": 0.7222513556480408, "adv/std_reasoning": 0.6815750002861023, "adv/std_step_conf": 0.8438770174980164, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.734375, "calib/ece": 0.21865737051792833, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.796812749003984, "calib/gap": 0.25239186228482, "calib/mean_conf": 0.85698406374502, "calib/mu_c": 0.9283777777777777, "calib/mu_w": 0.6759859154929577, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17925498007968133, "calib/std_conf": 0.3073001954190747, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40082438334007275, "calib/step_q_c_n": 2473.0, "calib/step_q_gap": 0.026532543463244462, "calib/step_q_w": 0.3742918398768283, "calib/step_q_w_n": 1299.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 806.23828125, "completions/mean_terminated_length": 822.298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.18453333333333333, "grad_norm": 1.289366602897644, "kl": 0.21563720703125, "learning_rate": 7.777777777777779e-07, "loss": -0.0774, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02132277935743332, "mask/share_reasoning": 0.7813730835914612, "mask/share_step_conf": 0.1777728945016861, "num_tokens": 52599717.0, "reward": 0.6934113502502441, "reward_std": 0.2096310555934906, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.770208477973938, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.27989545464515686, "step": 173 }, { "adv/mean_abs_final_conf": 0.6756685972213745, "adv/mean_abs_reasoning": 0.6307892799377441, "adv/mean_abs_step_conf": 0.6734330058097839, "adv/ratio_final_to_reasoning": 1.0711478757027382, "adv/ratio_step_to_reasoning": 1.067603758066796, "adv/std_final_conf": 0.8757950663566589, "adv/std_reasoning": 0.8431174159049988, "adv/std_step_conf": 0.876012921333313, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 16.109375, "calib/ece": 0.2903601694915255, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.7033898305084746, "calib/gap": 0.11443405889884761, "calib/mean_conf": 0.7849364406779661, "calib/mu_c": 0.8193636363636363, "calib/mu_w": 0.7049295774647887, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18807203389830512, "calib/std_conf": 0.34952009499837083, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36908685661764706, "calib/step_q_c_n": 2176.0, "calib/step_q_gap": -0.035138593253818184, "calib/step_q_w": 0.40422544987146525, "calib/step_q_w_n": 1945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 778.40625, "completions/mean_terminated_length": 840.81005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.1856, "grad_norm": 1.0049992799758911, "kl": 0.201873779296875, "learning_rate": 7.5e-07, "loss": -0.2353, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.017570119351148605, "mask/share_reasoning": 0.751032829284668, "mask/share_step_conf": 0.15717829763889313, "num_tokens": 52903221.0, "reward": 0.6363409757614136, "reward_std": 0.30102431774139404, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.652944028377533, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.3072379529476166, "step": 174 }, { "adv/mean_abs_final_conf": 0.5732844471931458, "adv/mean_abs_reasoning": 0.48877519369125366, "adv/mean_abs_step_conf": 0.670279860496521, "adv/ratio_final_to_reasoning": 1.1729000460593635, "adv/ratio_step_to_reasoning": 1.3713459053323378, "adv/std_final_conf": 0.7766127586364746, "adv/std_reasoning": 0.7209515571594238, "adv/std_step_conf": 0.8758025765419006, "calib/answer_extract_rate": 0.91796875, "calib/avg_num_step_conf": 18.12890625, "calib/ece": 0.24031914893617012, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.548936170212766, "calib/gap": 0.28600727272727267, "calib/mean_conf": 0.6724042553191488, "calib/mu_c": 0.80628, "calib/mu_w": 0.5202727272727273, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19040425531914884, "calib/std_conf": 0.39118405417042684, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3666872119815668, "calib/step_q_c_n": 1736.0, "calib/step_q_gap": -0.059375438620842824, "calib/step_q_w": 0.4260626506024096, "calib/step_q_w_n": 2905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 802.3515625, "completions/mean_terminated_length": 874.051025390625, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.18666666666666668, "grad_norm": 1.099468469619751, "kl": 0.203277587890625, "learning_rate": 7.222222222222222e-07, "loss": -0.2011, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.018801182508468628, "mask/share_reasoning": 0.7300344705581665, "mask/share_step_conf": 0.16913309693336487, "num_tokens": 53214447.0, "reward": 0.5531612634658813, "reward_std": 0.21131201088428497, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6615592241287231, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.1635132133960724, "step": 175 }, { "adv/mean_abs_final_conf": 0.4588373899459839, "adv/mean_abs_reasoning": 0.42444854974746704, "adv/mean_abs_step_conf": 0.6860179305076599, "adv/ratio_final_to_reasoning": 1.0810200440523994, "adv/ratio_step_to_reasoning": 1.6162569784154477, "adv/std_final_conf": 0.7221850752830505, "adv/std_reasoning": 0.7014881372451782, "adv/std_step_conf": 0.8756811022758484, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.48046875, "calib/ece": 0.1927714285714286, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.763265306122449, "calib/gap": 0.33404130201136184, "calib/mean_conf": 0.8189755102040817, "calib/mu_c": 0.9253233532934132, "calib/mu_w": 0.5912820512820514, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1650571428571429, "calib/std_conf": 0.3378478334752546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37296883116883117, "calib/step_q_c_n": 2310.0, "calib/step_q_gap": -0.022045929871701175, "calib/step_q_w": 0.39501476104053235, "calib/step_q_w_n": 1653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 778.71875, "completions/mean_terminated_length": 813.6815795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.18773333333333334, "grad_norm": 1.1529240608215332, "kl": 0.2037353515625, "learning_rate": 6.944444444444446e-07, "loss": -0.249, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02091934159398079, "mask/share_reasoning": 0.7594068646430969, "mask/share_step_conf": 0.17670507729053497, "num_tokens": 53517863.0, "reward": 0.7074384689331055, "reward_std": 0.21603649854660034, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7608078122138977, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.33219408988952637, "step": 176 }, { "adv/mean_abs_final_conf": 0.48085418343544006, "adv/mean_abs_reasoning": 0.38578104972839355, "adv/mean_abs_step_conf": 0.6780802607536316, "adv/ratio_final_to_reasoning": 1.246443244876808, "adv/ratio_step_to_reasoning": 1.757681620782123, "adv/std_final_conf": 0.7400456070899963, "adv/std_reasoning": 0.6816667318344116, "adv/std_step_conf": 0.875238835811615, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 14.42578125, "calib/ece": 0.18491701244813277, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6970954356846473, "calib/gap": 0.3424996165055991, "calib/mean_conf": 0.7964522821576763, "calib/mu_c": 0.912987421383648, "calib/mu_w": 0.5704878048780488, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1608091286307054, "calib/std_conf": 0.3339682924127909, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3727267054468535, "calib/step_q_c_n": 1891.0, "calib/step_q_gap": -0.009210586451037728, "calib/step_q_w": 0.3819372918978912, "calib/step_q_w_n": 1802.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 702.4140625, "completions/mean_terminated_length": 746.1328125, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.1888, "grad_norm": 1.0421947240829468, "kl": 0.227264404296875, "learning_rate": 6.666666666666667e-07, "loss": -0.2115, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02060539461672306, "mask/share_reasoning": 0.7536983489990234, "mask/share_step_conf": 0.16710247099399567, "num_tokens": 53801513.0, "reward": 0.7175785303115845, "reward_std": 0.1975366175174713, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7486916780471802, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.37474653124809265, "step": 177 }, { "adv/mean_abs_final_conf": 0.5379396080970764, "adv/mean_abs_reasoning": 0.4493616223335266, "adv/mean_abs_step_conf": 0.6565547585487366, "adv/ratio_final_to_reasoning": 1.1971196055941893, "adv/ratio_step_to_reasoning": 1.4610832922029697, "adv/std_final_conf": 0.7765239477157593, "adv/std_reasoning": 0.7393073439598083, "adv/std_step_conf": 0.860089123249054, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.96484375, "calib/ece": 0.1662753036437247, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7246963562753036, "calib/gap": 0.3089900768245838, "calib/mean_conf": 0.81834008097166, "calib/mu_c": 0.9071590909090909, "calib/mu_w": 0.5981690140845071, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1360323886639676, "calib/std_conf": 0.3129141429580187, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3645833405358686, "calib/step_q_c_n": 2314.0, "calib/step_q_gap": -0.025199058936774765, "calib/step_q_w": 0.38978239947264337, "calib/step_q_w_n": 1517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 742.91015625, "completions/mean_terminated_length": 769.9797973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.18986666666666666, "grad_norm": 1.3304215669631958, "kl": 0.22064208984375, "learning_rate": 6.388888888888889e-07, "loss": -0.156, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02037697285413742, "mask/share_reasoning": 0.7630996704101562, "mask/share_step_conf": 0.18136712908744812, "num_tokens": 54097770.0, "reward": 0.696796178817749, "reward_std": 0.23738355934619904, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7840775847434998, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.2790459394454956, "step": 178 }, { "adv/mean_abs_final_conf": 0.5894767045974731, "adv/mean_abs_reasoning": 0.4320108890533447, "adv/mean_abs_step_conf": 0.6031880974769592, "adv/ratio_final_to_reasoning": 1.364495015135335, "adv/ratio_step_to_reasoning": 1.3962335504984864, "adv/std_final_conf": 0.8110029101371765, "adv/std_reasoning": 0.7014033794403076, "adv/std_step_conf": 0.8274986147880554, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.6015625, "calib/ece": 0.15346530612244896, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6653061224489796, "calib/gap": 0.39412153110047843, "calib/mean_conf": 0.7682816326530612, "calib/mu_c": 0.856757894736842, "calib/mu_w": 0.4626363636363636, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07311836734693877, "calib/std_conf": 0.34810459117531684, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37205936091765673, "calib/step_q_c_n": 2441.0, "calib/step_q_gap": 0.02078179730932983, "calib/step_q_w": 0.3512775636083269, "calib/step_q_w_n": 1297.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 774.578125, "completions/mean_terminated_length": 809.3550415039062, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.19093333333333334, "grad_norm": 1.688877820968628, "kl": 0.205169677734375, "learning_rate": 6.111111111111112e-07, "loss": -0.0878, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019409772008657455, "mask/share_reasoning": 0.7689638137817383, "mask/share_step_conf": 0.16865769028663635, "num_tokens": 54402326.0, "reward": 0.6934785842895508, "reward_std": 0.20034080743789673, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.8057296872138977, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2413836568593979, "step": 179 }, { "adv/mean_abs_final_conf": 0.4851997494697571, "adv/mean_abs_reasoning": 0.3911029100418091, "adv/mean_abs_step_conf": 0.5557488203048706, "adv/ratio_final_to_reasoning": 1.2405935548213871, "adv/ratio_step_to_reasoning": 1.4209784842701907, "adv/std_final_conf": 0.75730961561203, "adv/std_reasoning": 0.6818585395812988, "adv/std_step_conf": 0.8105658888816833, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 17.41015625, "calib/ece": 0.17533898305084752, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.7076271186440678, "calib/gap": 0.32153015985162936, "calib/mean_conf": 0.7915169491525423, "calib/mu_c": 0.8827988165680474, "calib/mu_w": 0.561268656716418, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1253771186440679, "calib/std_conf": 0.3434018433030211, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3868223106060606, "calib/step_q_c_n": 2640.0, "calib/step_q_gap": -0.04154697943246444, "calib/step_q_w": 0.42836929003852503, "calib/step_q_w_n": 1817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 861.81640625, "completions/mean_terminated_length": 926.995849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.192, "grad_norm": 1.3137576580047607, "kl": 0.18927001953125, "learning_rate": 5.833333333333334e-07, "loss": -0.232, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01645885594189167, "mask/share_reasoning": 0.7454394102096558, "mask/share_step_conf": 0.1677892506122589, "num_tokens": 54726807.0, "reward": 0.6496822237968445, "reward_std": 0.1653367280960083, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7410233020782471, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.2419348955154419, "step": 180 }, { "adv/mean_abs_final_conf": 0.5677691698074341, "adv/mean_abs_reasoning": 0.5137337446212769, "adv/mean_abs_step_conf": 0.6264705657958984, "adv/ratio_final_to_reasoning": 1.1051817712032757, "adv/ratio_step_to_reasoning": 1.2194460113920118, "adv/std_final_conf": 0.8043436408042908, "adv/std_reasoning": 0.7756029963493347, "adv/std_step_conf": 0.8440066576004028, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 17.52734375, "calib/ece": 0.17845726495726488, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.6965811965811965, "calib/gap": 0.3656142913541255, "calib/mean_conf": 0.7876880341880341, "calib/mu_c": 0.920496644295302, "calib/mu_w": 0.5548823529411765, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16469658119658115, "calib/std_conf": 0.3390760461467229, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3883713749413421, "calib/step_q_c_n": 2131.0, "calib/step_q_gap": -0.09216385426069529, "calib/step_q_w": 0.4805352292020374, "calib/step_q_w_n": 2356.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 730.82421875, "completions/mean_terminated_length": 796.1318969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.19306666666666666, "grad_norm": 1.38997483253479, "kl": 0.220367431640625, "learning_rate": 5.555555555555555e-07, "loss": -0.2918, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.01842355541884899, "mask/share_reasoning": 0.731447696685791, "mask/share_step_conf": 0.16809748113155365, "num_tokens": 55020162.0, "reward": 0.6222001314163208, "reward_std": 0.24766887724399567, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7313224673271179, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.21385908126831055, "step": 181 }, { "adv/mean_abs_final_conf": 0.40869730710983276, "adv/mean_abs_reasoning": 0.3939915895462036, "adv/mean_abs_step_conf": 0.6618375778198242, "adv/ratio_final_to_reasoning": 1.0373249530036088, "adv/ratio_step_to_reasoning": 1.6798266647826758, "adv/std_final_conf": 0.6826504468917847, "adv/std_reasoning": 0.6816443800926208, "adv/std_step_conf": 0.8758221864700317, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.66015625, "calib/ece": 0.17861788617886182, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7642276422764228, "calib/gap": 0.33253009088676, "calib/mean_conf": 0.827520325203252, "calib/mu_c": 0.9207909604519774, "calib/mu_w": 0.5882608695652174, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14331300813008133, "calib/std_conf": 0.324837904532449, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34900135964912277, "calib/step_q_c_n": 2280.0, "calib/step_q_gap": -0.04738846105648736, "calib/step_q_w": 0.39638982070561013, "calib/step_q_w_n": 1729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 775.29296875, "completions/mean_terminated_length": 806.8088989257812, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.19413333333333332, "grad_norm": 0.639399528503418, "kl": 0.210113525390625, "learning_rate": 5.277777777777779e-07, "loss": -0.0989, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019323285669088364, "mask/share_reasoning": 0.7690304517745972, "mask/share_step_conf": 0.17258381843566895, "num_tokens": 55324797.0, "reward": 0.7346382141113281, "reward_std": 0.20908474922180176, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7833744287490845, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.35543322563171387, "step": 182 }, { "adv/mean_abs_final_conf": 0.599501371383667, "adv/mean_abs_reasoning": 0.5415783524513245, "adv/mean_abs_step_conf": 0.7447316646575928, "adv/ratio_final_to_reasoning": 1.1069522418504505, "adv/ratio_step_to_reasoning": 1.375113427792569, "adv/std_final_conf": 0.8279132843017578, "adv/std_reasoning": 0.7931107878684998, "adv/std_step_conf": 0.9066308736801147, "calib/answer_extract_rate": 0.92578125, "calib/avg_num_step_conf": 17.9296875, "calib/ece": 0.1794050632911392, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6075949367088608, "calib/gap": 0.22172419895678075, "calib/mean_conf": 0.7547215189873417, "calib/mu_c": 0.8117897727272727, "calib/mu_w": 0.590065573770492, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09575527426160335, "calib/std_conf": 0.3328867321729186, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.353248, "calib/step_q_c_n": 2450.0, "calib/step_q_gap": -0.06968190654205603, "calib/step_q_w": 0.42292990654205603, "calib/step_q_w_n": 2140.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 766.8046875, "completions/mean_terminated_length": 828.2784423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.1952, "grad_norm": 1.161199688911438, "kl": 0.207366943359375, "learning_rate": 5.000000000000001e-07, "loss": -0.3313, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01840786077082157, "mask/share_reasoning": 0.7362696528434753, "mask/share_step_conf": 0.17110374569892883, "num_tokens": 55627779.0, "reward": 0.6906335353851318, "reward_std": 0.2569107413291931, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7245738506317139, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.3340369760990143, "step": 183 }, { "adv/mean_abs_final_conf": 0.5056432485580444, "adv/mean_abs_reasoning": 0.4590601623058319, "adv/mean_abs_step_conf": 0.6574715375900269, "adv/ratio_final_to_reasoning": 1.1014749047667924, "adv/ratio_step_to_reasoning": 1.4322121403163943, "adv/std_final_conf": 0.7579997181892395, "adv/std_reasoning": 0.7394355535507202, "adv/std_step_conf": 0.8758077621459961, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 16.91796875, "calib/ece": 0.12669957983193275, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.7605042016806722, "calib/gap": 0.38538515325670486, "calib/mean_conf": 0.832329831932773, "calib/mu_c": 0.9262472222222221, "calib/mu_w": 0.5408620689655173, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10136344537815126, "calib/std_conf": 0.311800230689387, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38726167615947926, "calib/step_q_c_n": 2458.0, "calib/step_q_gap": -0.03298098267661259, "calib/step_q_w": 0.42024265883609185, "calib/step_q_w_n": 1873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 727.18359375, "completions/mean_terminated_length": 782.1807250976562, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.19626666666666667, "grad_norm": 0.7605681419372559, "kl": 0.2161865234375, "learning_rate": 4.7222222222222226e-07, "loss": -0.2007, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.019348185509443283, "mask/share_reasoning": 0.7410755753517151, "mask/share_step_conf": 0.16926376521587372, "num_tokens": 55919218.0, "reward": 0.7029640674591064, "reward_std": 0.22882726788520813, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7946516275405884, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.28471386432647705, "step": 184 }, { "adv/mean_abs_final_conf": 0.543674647808075, "adv/mean_abs_reasoning": 0.4470303952693939, "adv/mean_abs_step_conf": 0.6151069402694702, "adv/ratio_final_to_reasoning": 1.216191680837363, "adv/ratio_step_to_reasoning": 1.375984601447041, "adv/std_final_conf": 0.779508650302887, "adv/std_reasoning": 0.7206876873970032, "adv/std_step_conf": 0.844012439250946, "calib/answer_extract_rate": 0.94921875, "calib/avg_num_step_conf": 15.28515625, "calib/ece": 0.1368724279835391, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.691358024691358, "calib/gap": 0.4064119047619048, "calib/mean_conf": 0.7872427983539095, "calib/mu_c": 0.9126785714285715, "calib/mu_w": 0.5062666666666666, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11637860082304528, "calib/std_conf": 0.3335376082403418, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34429794159885113, "calib/step_q_c_n": 2089.0, "calib/step_q_gap": -0.043456828137990955, "calib/step_q_w": 0.3877547697368421, "calib/step_q_w_n": 1824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 749.9453125, "completions/mean_terminated_length": 790.0657958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.19733333333333333, "grad_norm": 1.293918490409851, "kl": 0.20623779296875, "learning_rate": 4.444444444444445e-07, "loss": -0.2047, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01943805254995823, "mask/share_reasoning": 0.7589957118034363, "mask/share_step_conf": 0.17078498005867004, "num_tokens": 56218124.0, "reward": 0.6847925186157227, "reward_std": 0.23087607324123383, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7969818115234375, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.25150930881500244, "step": 185 }, { "adv/mean_abs_final_conf": 0.4829450845718384, "adv/mean_abs_reasoning": 0.45892712473869324, "adv/mean_abs_step_conf": 0.5779283046722412, "adv/ratio_final_to_reasoning": 1.0523350190878795, "adv/ratio_step_to_reasoning": 1.2593029993624927, "adv/std_final_conf": 0.758141815662384, "adv/std_reasoning": 0.7208572626113892, "adv/std_step_conf": 0.8272896409034729, "calib/answer_extract_rate": 0.8515625, "calib/avg_num_step_conf": 20.10546875, "calib/ece": 0.05591743119266059, "calib/final_conf_rate": 0.8515625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.7385321100917431, "calib/gap": 0.5937821519140198, "calib/mean_conf": 0.8089908256880735, "calib/mu_c": 0.942455621301775, "calib/mu_w": 0.3486734693877551, "calib/nonempty_final_conf_rate": 0.8515625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.044839449541284444, "calib/std_conf": 0.3308272781818732, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37102044943820217, "calib/step_q_c_n": 2225.0, "calib/step_q_gap": -0.0818712343400319, "calib/step_q_w": 0.4528916837782341, "calib/step_q_w_n": 2922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 675.3046875, "completions/mean_terminated_length": 789.397216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.1984, "grad_norm": 1.1111749410629272, "kl": 0.2142333984375, "learning_rate": 4.1666666666666667e-07, "loss": -0.4156, "mask/has_final_conf_rate": 0.8515625, "mask/share_final_conf": 0.017488744109869003, "mask/share_reasoning": 0.6818581819534302, "mask/share_step_conf": 0.15612183511257172, "num_tokens": 56496042.0, "reward": 0.6668155193328857, "reward_std": 0.21297681331634521, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7852226495742798, "rewards/format_reward_step": 0.8515625, "rewards/step_margin_reward": 0.24606461822986603, "step": 186 }, { "adv/mean_abs_final_conf": 0.6033467054367065, "adv/mean_abs_reasoning": 0.5293259620666504, "adv/mean_abs_step_conf": 0.7025371193885803, "adv/ratio_final_to_reasoning": 1.1398396237378128, "adv/ratio_step_to_reasoning": 1.3272296651493545, "adv/std_final_conf": 0.8274721503257751, "adv/std_reasoning": 0.7756878137588501, "adv/std_step_conf": 0.906362771987915, "calib/answer_extract_rate": 0.8984375, "calib/avg_num_step_conf": 19.078125, "calib/ece": 0.16744347826086953, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.5826086956521739, "calib/gap": 0.31883709806005844, "calib/mean_conf": 0.7278782608695651, "calib/mu_c": 0.8263018867924529, "calib/mu_w": 0.5074647887323944, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10200869565217387, "calib/std_conf": 0.3486890267179298, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36826494481236205, "calib/step_q_c_n": 2265.0, "calib/step_q_gap": -0.021178501882326506, "calib/step_q_w": 0.38944344669468856, "calib/step_q_w_n": 2617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 805.78515625, "completions/mean_terminated_length": 892.9913330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.19946666666666665, "grad_norm": 1.214120626449585, "kl": 0.195098876953125, "learning_rate": 3.8888888888888895e-07, "loss": -0.3161, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.01709054782986641, "mask/share_reasoning": 0.7231029272079468, "mask/share_step_conf": 0.16215023398399353, "num_tokens": 56803867.0, "reward": 0.6590248346328735, "reward_std": 0.2382904291152954, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7168227434158325, "rewards/format_reward_step": 0.89453125, "rewards/step_margin_reward": 0.29810184240341187, "step": 187 }, { "adv/mean_abs_final_conf": 0.4895247220993042, "adv/mean_abs_reasoning": 0.4144752323627472, "adv/mean_abs_step_conf": 0.7287554740905762, "adv/ratio_final_to_reasoning": 1.1810711084199936, "adv/ratio_step_to_reasoning": 1.758260608085677, "adv/std_final_conf": 0.7579102516174316, "adv/std_reasoning": 0.7014322876930237, "adv/std_step_conf": 0.9215178489685059, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 17.296875, "calib/ece": 0.10073221757322179, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.7322175732217573, "calib/gap": 0.48930679702048424, "calib/mean_conf": 0.8173849372384938, "calib/mu_c": 0.9402234636871509, "calib/mu_w": 0.4509166666666667, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08458158995815904, "calib/std_conf": 0.32389517769325593, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35254701244813275, "calib/step_q_c_n": 2410.0, "calib/step_q_gap": -0.008747832396712063, "calib/step_q_w": 0.3612948448448448, "calib/step_q_w_n": 1998.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 805.2421875, "completions/mean_terminated_length": 855.3610229492188, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.20053333333333334, "grad_norm": 1.4672449827194214, "kl": 0.20538330078125, "learning_rate": 3.611111111111111e-07, "loss": -0.1776, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018879268318414688, "mask/share_reasoning": 0.7498528361320496, "mask/share_step_conf": 0.17267413437366486, "num_tokens": 57114081.0, "reward": 0.7526510953903198, "reward_std": 0.21485695242881775, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.823620080947876, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.35590076446533203, "step": 188 }, { "adv/mean_abs_final_conf": 0.5508660078048706, "adv/mean_abs_reasoning": 0.41227346658706665, "adv/mean_abs_step_conf": 0.5993289947509766, "adv/ratio_final_to_reasoning": 1.3361665313198978, "adv/ratio_step_to_reasoning": 1.4537171157591493, "adv/std_final_conf": 0.7943575382232666, "adv/std_reasoning": 0.681851327419281, "adv/std_step_conf": 0.8275741338729858, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 15.71875, "calib/ece": 0.17445833333333333, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6166666666666667, "calib/gap": 0.3424080104200586, "calib/mean_conf": 0.7287916666666666, "calib/mu_c": 0.834367469879518, "calib/mu_w": 0.49195945945945946, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10579166666666666, "calib/std_conf": 0.3626002710936965, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37252279635258356, "calib/step_q_c_n": 1974.0, "calib/step_q_gap": -0.027345496330343277, "calib/step_q_w": 0.39986829268292684, "calib/step_q_w_n": 2050.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 697.14453125, "completions/mean_terminated_length": 743.620849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.2016, "grad_norm": 1.2404696941375732, "kl": 0.227996826171875, "learning_rate": 3.3333333333333335e-07, "loss": -0.1732, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02052876353263855, "mask/share_reasoning": 0.7510529160499573, "mask/share_step_conf": 0.16591832041740417, "num_tokens": 57400318.0, "reward": 0.6404236555099487, "reward_std": 0.18784648180007935, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7499300837516785, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.21372976899147034, "step": 189 }, { "adv/mean_abs_final_conf": 0.5000724792480469, "adv/mean_abs_reasoning": 0.46006476879119873, "adv/mean_abs_step_conf": 0.6737987995147705, "adv/ratio_final_to_reasoning": 1.0869610393379323, "adv/ratio_step_to_reasoning": 1.4645737844372417, "adv/std_final_conf": 0.7748420238494873, "adv/std_reasoning": 0.7394383549690247, "adv/std_step_conf": 0.8910828232765198, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 18.03515625, "calib/ece": 0.13979059829059842, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.7478632478632479, "calib/gap": 0.42944550327575937, "calib/mean_conf": 0.8211581196581197, "calib/mu_c": 0.9551304347826087, "calib/mu_w": 0.5256849315068494, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1364572649572651, "calib/std_conf": 0.32836119333558894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3299528905560459, "calib/step_q_c_n": 2266.0, "calib/step_q_gap": -0.0427577006817253, "calib/step_q_w": 0.37271059123777117, "calib/step_q_w_n": 2351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 811.3125, "completions/mean_terminated_length": 880.0678100585938, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.20266666666666666, "grad_norm": 1.3556324243545532, "kl": 0.188079833984375, "learning_rate": 3.055555555555556e-07, "loss": -0.2387, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.017239660024642944, "mask/share_reasoning": 0.7378236055374146, "mask/share_step_conf": 0.1668117344379425, "num_tokens": 57713622.0, "reward": 0.6745651960372925, "reward_std": 0.23061785101890564, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7677172422409058, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.27360066771507263, "step": 190 }, { "adv/mean_abs_final_conf": 0.4767415523529053, "adv/mean_abs_reasoning": 0.4095577001571655, "adv/mean_abs_step_conf": 0.5481799244880676, "adv/ratio_final_to_reasoning": 1.1640400172428897, "adv/ratio_step_to_reasoning": 1.3384681188455414, "adv/std_final_conf": 0.7403095960617065, "adv/std_reasoning": 0.681888997554779, "adv/std_step_conf": 0.7935493588447571, "calib/answer_extract_rate": 0.8984375, "calib/avg_num_step_conf": 18.0859375, "calib/ece": 0.18001739130434788, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.6869565217391305, "calib/gap": 0.3283962217250809, "calib/mean_conf": 0.7876695652173913, "calib/mu_c": 0.9033221476510067, "calib/mu_w": 0.5749259259259258, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15993043478260874, "calib/std_conf": 0.33018642222470335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3850604444444444, "calib/step_q_c_n": 2025.0, "calib/step_q_gap": -0.061241666879931766, "calib/step_q_w": 0.4463021113243762, "calib/step_q_w_n": 2605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 720.7890625, "completions/mean_terminated_length": 795.3534545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.20373333333333332, "grad_norm": 1.053230881690979, "kl": 0.2259521484375, "learning_rate": 2.7777777777777776e-07, "loss": -0.312, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.019232170656323433, "mask/share_reasoning": 0.7079811096191406, "mask/share_step_conf": 0.1790367066860199, "num_tokens": 58002312.0, "reward": 0.6380224227905273, "reward_std": 0.1949125975370407, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7125676870346069, "rewards/format_reward_step": 0.8984375, "rewards/step_margin_reward": 0.2673834264278412, "step": 191 }, { "adv/mean_abs_final_conf": 0.4584586024284363, "adv/mean_abs_reasoning": 0.3656942844390869, "adv/mean_abs_step_conf": 0.6547200679779053, "adv/ratio_final_to_reasoning": 1.2536663052626982, "adv/ratio_step_to_reasoning": 1.7903481017816152, "adv/std_final_conf": 0.7212614417076111, "adv/std_reasoning": 0.6406073570251465, "adv/std_step_conf": 0.843872606754303, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 16.19140625, "calib/ece": 0.07082521551724141, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.6508620689655172, "calib/gap": 0.5387381102756892, "calib/mean_conf": 0.7644459051724138, "calib/mu_c": 0.8968082857142857, "calib/mu_w": 0.3580701754385965, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04048038793103452, "calib/std_conf": 0.34546469530939794, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3580890366972477, "calib/step_q_c_n": 2180.0, "calib/step_q_gap": -0.06594709561827389, "calib/step_q_w": 0.42403613231552156, "calib/step_q_w_n": 1965.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 715.5, "completions/mean_terminated_length": 782.769287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.2048, "grad_norm": 1.5645737648010254, "kl": 0.224853515625, "learning_rate": 2.5000000000000004e-07, "loss": -0.3583, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.020545516163110733, "mask/share_reasoning": 0.7284271121025085, "mask/share_step_conf": 0.16508983075618744, "num_tokens": 58290456.0, "reward": 0.7366434335708618, "reward_std": 0.20213548839092255, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.8110120296478271, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.34430617094039917, "step": 192 }, { "adv/mean_abs_final_conf": 0.5915266275405884, "adv/mean_abs_reasoning": 0.5218731164932251, "adv/mean_abs_step_conf": 0.6205939054489136, "adv/ratio_final_to_reasoning": 1.1334682873021062, "adv/ratio_step_to_reasoning": 1.1891662663504339, "adv/std_final_conf": 0.8266447186470032, "adv/std_reasoning": 0.7755892276763916, "adv/std_step_conf": 0.8439964056015015, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 17.828125, "calib/ece": 0.21960729613733918, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.6437768240343348, "calib/gap": 0.2658452380952381, "calib/mean_conf": 0.7656587982832619, "calib/mu_c": 0.8614999999999999, "calib/mu_w": 0.5956547619047619, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17289055793991426, "calib/std_conf": 0.3415928990606869, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3481242957746478, "calib/step_q_c_n": 1988.0, "calib/step_q_gap": -0.06758511157524866, "calib/step_q_w": 0.4157094073498965, "calib/step_q_w_n": 2576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 748.2890625, "completions/mean_terminated_length": 815.1574096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.20586666666666667, "grad_norm": 1.8033738136291504, "kl": 0.205810546875, "learning_rate": 2.2222222222222224e-07, "loss": -0.2309, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.018781986087560654, "mask/share_reasoning": 0.7305821180343628, "mask/share_step_conf": 0.16860461235046387, "num_tokens": 58587730.0, "reward": 0.6168700456619263, "reward_std": 0.24369192123413086, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6872923374176025, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.24879151582717896, "step": 193 }, { "adv/mean_abs_final_conf": 0.46990489959716797, "adv/mean_abs_reasoning": 0.3474540710449219, "adv/mean_abs_step_conf": 0.5742833614349365, "adv/ratio_final_to_reasoning": 1.3524230646772724, "adv/ratio_step_to_reasoning": 1.6528324440345619, "adv/std_final_conf": 0.7213148474693298, "adv/std_reasoning": 0.6403520107269287, "adv/std_step_conf": 0.8269177675247192, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 17.01953125, "calib/ece": 0.12043750000000003, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.65, "calib/gap": 0.46018681318681304, "calib/mean_conf": 0.7449375000000001, "calib/mu_c": 0.8695714285714284, "calib/mu_w": 0.4093846153846154, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0681041666666667, "calib/std_conf": 0.3680022648396112, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.372512690134724, "calib/step_q_c_n": 2301.0, "calib/step_q_gap": -0.026359391577338243, "calib/step_q_w": 0.39887208171206223, "calib/step_q_w_n": 2056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 732.6328125, "completions/mean_terminated_length": 781.4750366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.20693333333333333, "grad_norm": 1.3977628946304321, "kl": 0.2171630859375, "learning_rate": 1.9444444444444447e-07, "loss": -0.1234, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.019067011773586273, "mask/share_reasoning": 0.7472081184387207, "mask/share_step_conf": 0.17122486233711243, "num_tokens": 58881228.0, "reward": 0.6840992569923401, "reward_std": 0.15268655121326447, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.795563280582428, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.2484164834022522, "step": 194 }, { "adv/mean_abs_final_conf": 0.5955495238304138, "adv/mean_abs_reasoning": 0.48984187841415405, "adv/mean_abs_step_conf": 0.5682377815246582, "adv/ratio_final_to_reasoning": 1.2157995264890062, "adv/ratio_step_to_reasoning": 1.1600432845070499, "adv/std_final_conf": 0.8277072310447693, "adv/std_reasoning": 0.7394471764564514, "adv/std_step_conf": 0.8262815475463867, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 17.23828125, "calib/ece": 0.1139792531120332, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5933609958506224, "calib/gap": 0.44922449762233474, "calib/mean_conf": 0.7133153526970955, "calib/mu_c": 0.8661635220125786, "calib/mu_w": 0.4169390243902439, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08377178423236517, "calib/std_conf": 0.36904471292478735, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38244337016574587, "calib/step_q_c_n": 2172.0, "calib/step_q_gap": 0.005894284935937744, "calib/step_q_w": 0.3765490852298081, "calib/step_q_w_n": 2241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 781.796875, "completions/mean_terminated_length": 830.4564819335938, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.208, "grad_norm": 1.2796424627304077, "kl": 0.21484375, "learning_rate": 1.6666666666666668e-07, "loss": -0.163, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018959684297442436, "mask/share_reasoning": 0.7448201179504395, "mask/share_step_conf": 0.17762643098831177, "num_tokens": 59187352.0, "reward": 0.6303240060806274, "reward_std": 0.20820724964141846, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.789030909538269, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.1591169536113739, "step": 195 }, { "adv/mean_abs_final_conf": 0.37481117248535156, "adv/mean_abs_reasoning": 0.27760109305381775, "adv/mean_abs_step_conf": 0.682806670665741, "adv/ratio_final_to_reasoning": 1.350179022575707, "adv/ratio_step_to_reasoning": 2.4596685234714375, "adv/std_final_conf": 0.644271969795227, "adv/std_reasoning": 0.5726718902587891, "adv/std_step_conf": 0.8913529515266418, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.58203125, "calib/ece": 0.1978167330677291, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7729083665338645, "calib/gap": 0.35664941545480466, "calib/mean_conf": 0.8557450199203187, "calib/mu_c": 0.9751017964071855, "calib/mu_w": 0.6184523809523809, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19411155378486056, "calib/std_conf": 0.2826524457786999, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38418827519379845, "calib/step_q_c_n": 2064.0, "calib/step_q_gap": 0.02467730562550402, "calib/step_q_w": 0.3595109695682944, "calib/step_q_w_n": 1413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 668.58984375, "completions/mean_terminated_length": 681.9083862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 261.0, "epoch": 0.20906666666666668, "grad_norm": 1.3142296075820923, "kl": 0.24029541015625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0081, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.022626737132668495, "mask/share_reasoning": 0.7662988901138306, "mask/share_step_conf": 0.19154314696788788, "num_tokens": 59461055.0, "reward": 0.7016684412956238, "reward_std": 0.1755383312702179, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.8039993047714233, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.27277499437332153, "step": 196 }, { "adv/mean_abs_final_conf": 0.6367141008377075, "adv/mean_abs_reasoning": 0.538066565990448, "adv/mean_abs_step_conf": 0.7363102436065674, "adv/ratio_final_to_reasoning": 1.1833370461620742, "adv/ratio_step_to_reasoning": 1.3684370859415909, "adv/std_final_conf": 0.8445119261741638, "adv/std_reasoning": 0.7930350303649902, "adv/std_step_conf": 0.9213052988052368, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 16.94140625, "calib/ece": 0.19726609442060083, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.648068669527897, "calib/gap": 0.29037785947712436, "calib/mean_conf": 0.7663648068669527, "calib/mu_c": 0.8660653594771243, "calib/mu_w": 0.5756874999999999, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15348927038626609, "calib/std_conf": 0.3433324318520512, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35348792508624943, "calib/step_q_c_n": 2029.0, "calib/step_q_gap": -0.037050419801098966, "calib/step_q_w": 0.3905383448873484, "calib/step_q_w_n": 2308.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 719.6328125, "completions/mean_terminated_length": 783.9404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.21013333333333334, "grad_norm": 1.5192164182662964, "kl": 0.218231201171875, "learning_rate": 1.1111111111111112e-07, "loss": -0.2241, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.018830955028533936, "mask/share_reasoning": 0.7327144145965576, "mask/share_step_conf": 0.16642341017723083, "num_tokens": 59750337.0, "reward": 0.65471351146698, "reward_std": 0.2487749457359314, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7058836817741394, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.30198079347610474, "step": 197 }, { "adv/mean_abs_final_conf": 0.4821930527687073, "adv/mean_abs_reasoning": 0.4194025695323944, "adv/mean_abs_step_conf": 0.7058519124984741, "adv/ratio_final_to_reasoning": 1.1497141119243022, "adv/ratio_step_to_reasoning": 1.6829937720349482, "adv/std_final_conf": 0.740700900554657, "adv/std_reasoning": 0.7016922831535339, "adv/std_step_conf": 0.891240119934082, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 17.49609375, "calib/ece": 0.07405150214592279, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.5836909871244635, "calib/gap": 0.5530430402930404, "calib/mean_conf": 0.7119914163090129, "calib/mu_c": 0.8662738095238095, "calib/mu_w": 0.31323076923076915, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03250643776824039, "calib/std_conf": 0.3618361465111361, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37521608040201004, "calib/step_q_c_n": 2189.0, "calib/step_q_gap": -0.013832784226810946, "calib/step_q_w": 0.389048864628821, "calib/step_q_w_n": 2290.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 718.3984375, "completions/mean_terminated_length": 785.940185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.2112, "grad_norm": 1.0711225271224976, "kl": 0.2114105224609375, "learning_rate": 8.333333333333334e-08, "loss": -0.2661, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.019884463399648666, "mask/share_reasoning": 0.7254239320755005, "mask/share_step_conf": 0.16875414550304413, "num_tokens": 60039631.0, "reward": 0.681763768196106, "reward_std": 0.21229125559329987, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.810340940952301, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.239905446767807, "step": 198 }, { "adv/mean_abs_final_conf": 0.5740118026733398, "adv/mean_abs_reasoning": 0.4569406807422638, "adv/mean_abs_step_conf": 0.6765886545181274, "adv/ratio_final_to_reasoning": 1.2562063892864679, "adv/ratio_step_to_reasoning": 1.4806925341360786, "adv/std_final_conf": 0.8104844689369202, "adv/std_reasoning": 0.7208795547485352, "adv/std_step_conf": 0.8912445902824402, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 17.36328125, "calib/ece": 0.18282838983050834, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6991525423728814, "calib/gap": 0.26619367201426025, "calib/mean_conf": 0.8040148305084746, "calib/mu_c": 0.8784588235294117, "calib/mu_w": 0.6122651515151515, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13325211864406766, "calib/std_conf": 0.3146852725416814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3475078777442962, "calib/step_q_c_n": 2323.0, "calib/step_q_gap": -0.057662951661924366, "calib/step_q_w": 0.40517082940622057, "calib/step_q_w_n": 2122.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 740.796875, "completions/mean_terminated_length": 803.5762939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.21226666666666666, "grad_norm": 1.425162672996521, "kl": 0.22003173828125, "learning_rate": 5.555555555555556e-08, "loss": -0.3178, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01996612176299095, "mask/share_reasoning": 0.7292059659957886, "mask/share_step_conf": 0.17270290851593018, "num_tokens": 60333475.0, "reward": 0.6757093667984009, "reward_std": 0.24863620102405548, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7372885942459106, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.29694265127182007, "step": 199 }, { "adv/mean_abs_final_conf": 0.4264031648635864, "adv/mean_abs_reasoning": 0.29352903366088867, "adv/mean_abs_step_conf": 0.47116726636886597, "adv/ratio_final_to_reasoning": 1.4526779839987003, "adv/ratio_step_to_reasoning": 1.6051811314624538, "adv/std_final_conf": 0.7020553350448608, "adv/std_reasoning": 0.5962840914726257, "adv/std_step_conf": 0.7582021355628967, "calib/answer_extract_rate": 0.90234375, "calib/avg_num_step_conf": 18.41015625, "calib/ece": 0.05049134199134204, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.645021645021645, "calib/gap": 0.6014358986992512, "calib/mean_conf": 0.764517316017316, "calib/mu_c": 0.9181308139534884, "calib/mu_w": 0.31669491525423726, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03520995670995676, "calib/std_conf": 0.34323008848227377, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34493751724137933, "calib/step_q_c_n": 2175.0, "calib/step_q_gap": -0.04065893665932985, "calib/step_q_w": 0.3855964539007092, "calib/step_q_w_n": 2538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 724.07421875, "completions/mean_terminated_length": 802.437255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.21333333333333335, "grad_norm": 1.2463358640670776, "kl": 0.20904541015625, "learning_rate": 2.777777777777778e-08, "loss": -0.287, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.018621867522597313, "mask/share_reasoning": 0.7213960886001587, "mask/share_step_conf": 0.16232578456401825, "num_tokens": 60626886.0, "reward": 0.7203070521354675, "reward_std": 0.15783384442329407, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.8304967880249023, "rewards/format_reward_step": 0.90234375, "rewards/step_margin_reward": 0.2952735424041748, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.13025389663525858, "train_runtime": 14198.202, "train_samples_per_second": 3.606, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 60626886, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }