{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7557821869850159, "adv/mean_abs_reasoning": 0.28040462732315063, "adv/mean_abs_step_conf": 0.7484350204467773, "adv/ratio_final_to_reasoning": 2.69532708571895, "adv/ratio_step_to_reasoning": 2.669125069695258, "adv/std_final_conf": 0.9257818460464478, "adv/std_reasoning": 0.5727222561836243, "adv/std_step_conf": 0.9177737236022949, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.59765625, "calib/ece": 0.23243902439024394, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.008130081300813009, "calib/gap": -0.04614489795918364, "calib/mean_conf": 0.6646341463414636, "calib/mu_c": 0.6552551020408164, "calib/mu_w": 0.7014, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05016260162601624, "calib/std_conf": 0.05917169015101882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583372, "calib/step_q_c_n": 2500.0, "calib/step_q_gap": -0.0778082748585287, "calib/step_q_w": 0.6611802748585287, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 750.2265625, "completions/mean_terminated_length": 780.7235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0010666666666666667, "grad_norm": 0.31607913970947266, "kl": 0.00047022104263305664, "learning_rate": 0.0, "loss": -0.1529, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01929234340786934, "mask/share_reasoning": 0.7498296499252319, "mask/share_step_conf": 0.19181546568870544, "num_tokens": 299642.0, "reward": 0.9853408932685852, "reward_std": 0.09984834492206573, "rewards/accuracy_reward_step": 0.765625, "rewards/asymmetric_l2_reward": 0.854544997215271, "rewards/final_brier_reward_step": 0.7708241939544678, "rewards/format_reward_step": 0.9609375, "step": 1 }, { "adv/mean_abs_final_conf": 0.7929245233535767, "adv/mean_abs_reasoning": 0.4050842523574829, "adv/mean_abs_step_conf": 0.7462632656097412, "adv/ratio_final_to_reasoning": 1.9574311238685933, "adv/ratio_step_to_reasoning": 1.842242104615736, "adv/std_final_conf": 0.9301473498344421, "adv/std_reasoning": 0.6612725853919983, "adv/std_step_conf": 0.9254681468009949, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.078125, "calib/ece": 0.04704724409448811, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.008169981916817282, "calib/mean_conf": 0.6691732283464566, "calib/mu_c": 0.6717142857142857, "calib/mu_w": 0.6635443037974684, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013622047244094477, "calib/std_conf": 0.060200661111313364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5911686697057605, "calib/step_q_c_n": 2413.0, "calib/step_q_gap": -0.011375410898773475, "calib/step_q_w": 0.602544080604534, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 867.8828125, "completions/mean_terminated_length": 871.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.0021333333333333334, "grad_norm": 0.5649791359901428, "kl": 0.0006206929683685303, "learning_rate": 2.5000000000000004e-07, "loss": -0.0445, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01878987066447735, "mask/share_reasoning": 0.7856365442276001, "mask/share_step_conf": 0.1916673481464386, "num_tokens": 625108.0, "reward": 0.9965274333953857, "reward_std": 0.08741441369056702, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.8788357973098755, "rewards/final_brier_reward_step": 0.7790628671646118, "rewards/format_reward_step": 0.9921875, "step": 2 }, { "adv/mean_abs_final_conf": 0.7533313035964966, "adv/mean_abs_reasoning": 0.33080220222473145, "adv/mean_abs_step_conf": 0.7546164989471436, "adv/ratio_final_to_reasoning": 2.2772862409323342, "adv/ratio_step_to_reasoning": 2.2811713279783206, "adv/std_final_conf": 0.9281913638114929, "adv/std_reasoning": 0.6185181140899658, "adv/std_step_conf": 0.9237169027328491, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.2265625, "calib/ece": 0.13274509803921558, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.020459090909090816, "calib/mean_conf": 0.6668627450980392, "calib/mu_c": 0.6624500000000001, "calib/mu_w": 0.6829090909090909, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007647058823529403, "calib/std_conf": 0.053785711203993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5849299610894941, "calib/step_q_c_n": 2570.0, "calib/step_q_gap": -0.015474450675211804, "calib/step_q_w": 0.6004044117647059, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 824.67578125, "completions/mean_terminated_length": 827.9098510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0032, "grad_norm": 0.25716325640678406, "kl": 0.0004647970199584961, "learning_rate": 5.000000000000001e-07, "loss": -0.0462, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01885269209742546, "mask/share_reasoning": 0.7856958508491516, "mask/share_step_conf": 0.1915452480316162, "num_tokens": 941481.0, "reward": 1.0238713026046753, "reward_std": 0.07661885768175125, "rewards/accuracy_reward_step": 0.78125, "rewards/asymmetric_l2_reward": 0.8882023692131042, "rewards/final_brier_reward_step": 0.8040714859962463, "rewards/format_reward_step": 0.99609375, "step": 3 }, { "adv/mean_abs_final_conf": 0.731745719909668, "adv/mean_abs_reasoning": 0.28349608182907104, "adv/mean_abs_step_conf": 0.7188282608985901, "adv/ratio_final_to_reasoning": 2.5811493237880483, "adv/ratio_step_to_reasoning": 2.535584464733431, "adv/std_final_conf": 0.9128077030181885, "adv/std_reasoning": 0.5727400183677673, "adv/std_step_conf": 0.9239922761917114, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.04296875, "calib/ece": 0.10007905138339923, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": -0.015224960254372033, "calib/mean_conf": 0.674308300395257, "calib/mu_c": 0.6702162162162163, "calib/mu_w": 0.6854411764705883, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.021581027667984205, "calib/std_conf": 0.0639788735069172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5918399044205496, "calib/step_q_c_n": 2511.0, "calib/step_q_gap": -0.011398102959524259, "calib/step_q_w": 0.6032380073800738, "calib/step_q_w_n": 1084.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 871.63671875, "completions/mean_terminated_length": 878.5, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.004266666666666667, "grad_norm": 0.25293198227882385, "kl": 0.0004903972148895264, "learning_rate": 7.5e-07, "loss": -0.1005, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018006496131420135, "mask/share_reasoning": 0.7846561074256897, "mask/share_step_conf": 0.18952494859695435, "num_tokens": 1270788.0, "reward": 0.9918568134307861, "reward_std": 0.08511388301849365, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.860637903213501, "rewards/final_brier_reward_step": 0.7808883190155029, "rewards/format_reward_step": 0.98828125, "step": 4 }, { "adv/mean_abs_final_conf": 0.7353215217590332, "adv/mean_abs_reasoning": 0.2619389295578003, "adv/mean_abs_step_conf": 0.7723286151885986, "adv/ratio_final_to_reasoning": 2.807225039059247, "adv/ratio_step_to_reasoning": 2.9485064190054806, "adv/std_final_conf": 0.9268990755081177, "adv/std_reasoning": 0.5482674837112427, "adv/std_step_conf": 0.9180120229721069, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.21484375, "calib/ece": 0.17318897637795277, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.023622047244094488, "calib/gap": -0.05038610545669453, "calib/mean_conf": 0.6758661417322835, "calib/mu_c": 0.6566242038216561, "calib/mu_w": 0.7070103092783506, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11547244094488189, "calib/std_conf": 0.08066552524997236, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5908036622583926, "calib/step_q_c_n": 1966.0, "calib/step_q_gap": -0.02693094622935388, "calib/step_q_w": 0.6177346084877465, "calib/step_q_w_n": 1673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 870.48828125, "completions/mean_terminated_length": 880.810302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.005333333333333333, "grad_norm": 25712.10546875, "kl": 71.50087183713913, "learning_rate": 1.0000000000000002e-06, "loss": 0.899, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019235271960496902, "mask/share_reasoning": 0.7789666652679443, "mask/share_step_conf": 0.19007936120033264, "num_tokens": 1600321.0, "reward": 0.941598117351532, "reward_std": 0.0712629109621048, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8374879956245422, "rewards/final_brier_reward_step": 0.7246144413948059, "rewards/format_reward_step": 0.9921875, "step": 5 }, { "adv/mean_abs_final_conf": 0.7224254608154297, "adv/mean_abs_reasoning": 0.24291057884693146, "adv/mean_abs_step_conf": 0.7474434971809387, "adv/ratio_final_to_reasoning": 2.9740386945875317, "adv/ratio_step_to_reasoning": 3.0770314768873672, "adv/std_final_conf": 0.9267652034759521, "adv/std_reasoning": 0.5481183528900146, "adv/std_step_conf": 0.922061026096344, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.9375, "calib/ece": 0.04160156249999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.014151562910428184, "calib/mean_conf": 0.6590234375, "calib/mu_c": 0.653827160493827, "calib/mu_w": 0.6679787234042552, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.033906249999999985, "calib/std_conf": 0.0552075579126952, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5803855659911022, "calib/step_q_c_n": 2023.0, "calib/step_q_gap": -0.016061292038377917, "calib/step_q_w": 0.5964468580294802, "calib/step_q_w_n": 1289.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 761.17578125, "completions/mean_terminated_length": 767.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.0064, "grad_norm": 0.36427175998687744, "kl": 0.00048065185546875, "learning_rate": 1.25e-06, "loss": -0.0412, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020830316469073296, "mask/share_reasoning": 0.7746731042861938, "mask/share_step_conf": 0.19668403267860413, "num_tokens": 1901134.0, "reward": 0.9762635231018066, "reward_std": 0.051163263618946075, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.8686368465423584, "rewards/final_brier_reward_step": 0.7573277354240417, "rewards/format_reward_step": 1.0, "step": 6 }, { "adv/mean_abs_final_conf": 0.7687034606933594, "adv/mean_abs_reasoning": 0.3801042139530182, "adv/mean_abs_step_conf": 0.7339732050895691, "adv/ratio_final_to_reasoning": 2.022349220228253, "adv/ratio_step_to_reasoning": 1.9309788688117253, "adv/std_final_conf": 0.9293633699417114, "adv/std_reasoning": 0.6614393591880798, "adv/std_step_conf": 0.9263491034507751, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.65234375, "calib/ece": 0.07024096385542175, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.01606425702811245, "calib/gap": -0.0005579710144926198, "calib/mean_conf": 0.6769879518072289, "calib/mu_c": 0.6768333333333334, "calib/mu_w": 0.677391304347826, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012168674698795179, "calib/std_conf": 0.06392267429149058, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6020240782543266, "calib/step_q_c_n": 2658.0, "calib/step_q_gap": -0.025801422116318307, "calib/step_q_w": 0.6278255003706449, "calib/step_q_w_n": 1349.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 920.98828125, "completions/mean_terminated_length": 935.607177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.007466666666666667, "grad_norm": 0.32078471779823303, "kl": 0.0006136298179626465, "learning_rate": 1.5e-06, "loss": -0.1022, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017173487693071365, "mask/share_reasoning": 0.7793081402778625, "mask/share_step_conf": 0.18789339065551758, "num_tokens": 2244331.0, "reward": 0.9820345640182495, "reward_std": 0.12397666275501251, "rewards/accuracy_reward_step": 0.703125, "rewards/asymmetric_l2_reward": 0.8573399186134338, "rewards/final_brier_reward_step": 0.7715730667114258, "rewards/format_reward_step": 0.97265625, "step": 7 }, { "adv/mean_abs_final_conf": 0.7392407655715942, "adv/mean_abs_reasoning": 0.28015512228012085, "adv/mean_abs_step_conf": 0.735325038433075, "adv/ratio_final_to_reasoning": 2.638683739048126, "adv/ratio_step_to_reasoning": 2.6247067426375303, "adv/std_final_conf": 0.91434246301651, "adv/std_reasoning": 0.5726298689842224, "adv/std_step_conf": 0.9170846939086914, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.6484375, "calib/ece": 0.12346456692913389, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.022517995570321303, "calib/mean_conf": 0.6668503937007874, "calib/mu_c": 0.6592261904761905, "calib/mu_w": 0.6817441860465118, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06444881889763784, "calib/std_conf": 0.07923552837293327, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5887942936033134, "calib/step_q_c_n": 2173.0, "calib/step_q_gap": -0.03239784336561502, "calib/step_q_w": 0.6211921369689284, "calib/step_q_w_n": 1577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 867.69921875, "completions/mean_terminated_length": 874.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.008533333333333334, "grad_norm": 0.3923491835594177, "kl": 0.0010648369789123535, "learning_rate": 1.75e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019311347976326942, "mask/share_reasoning": 0.782284140586853, "mask/share_step_conf": 0.1905919909477234, "num_tokens": 2572974.0, "reward": 0.9751387238502502, "reward_std": 0.07677306234836578, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.8668625354766846, "rewards/final_brier_reward_step": 0.7537273168563843, "rewards/format_reward_step": 0.9921875, "step": 8 }, { "adv/mean_abs_final_conf": 0.756470799446106, "adv/mean_abs_reasoning": 0.31696003675460815, "adv/mean_abs_step_conf": 0.7314656972885132, "adv/ratio_final_to_reasoning": 2.3866440930273143, "adv/ratio_step_to_reasoning": 2.3077536991037677, "adv/std_final_conf": 0.9268925189971924, "adv/std_reasoning": 0.6186865568161011, "adv/std_step_conf": 0.9206506609916687, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.41015625, "calib/ece": 0.10880000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.032, "calib/gap": 0.02263290229885062, "calib/mean_conf": 0.67152, "calib/mu_c": 0.6767708333333333, "calib/mu_w": 0.6541379310344827, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.006159999999999996, "calib/std_conf": 0.07376780869729017, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6035622317596566, "calib/step_q_c_n": 2796.0, "calib/step_q_gap": 0.002386419889555791, "calib/step_q_w": 0.6011758118701008, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 839.76953125, "completions/mean_terminated_length": 853.0992431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.0096, "grad_norm": 0.5913500189781189, "kl": 0.0013288259506225586, "learning_rate": 2.0000000000000003e-06, "loss": -0.1152, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018608927726745605, "mask/share_reasoning": 0.7765543460845947, "mask/share_step_conf": 0.1892116665840149, "num_tokens": 2895491.0, "reward": 0.9958125352859497, "reward_std": 0.10686057060956955, "rewards/accuracy_reward_step": 0.75, "rewards/asymmetric_l2_reward": 0.8502780199050903, "rewards/final_brier_reward_step": 0.7960343360900879, "rewards/format_reward_step": 0.9765625, "step": 9 }, { "adv/mean_abs_final_conf": 0.7495448589324951, "adv/mean_abs_reasoning": 0.3233422040939331, "adv/mean_abs_step_conf": 0.7414923906326294, "adv/ratio_final_to_reasoning": 2.3181163777641203, "adv/ratio_step_to_reasoning": 2.293212519876375, "adv/std_final_conf": 0.9266313314437866, "adv/std_reasoning": 0.6185734868049622, "adv/std_step_conf": 0.9230352640151978, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.7890625, "calib/ece": 0.11211155378486055, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00398406374501992, "calib/gap": -0.011110508229336125, "calib/mean_conf": 0.668605577689243, "calib/mu_c": 0.6660824742268041, "calib/mu_w": 0.6771929824561402, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003904382470119522, "calib/std_conf": 0.06158633211639363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5938714390065741, "calib/step_q_c_n": 2738.0, "calib/step_q_gap": -0.031252606794952564, "calib/step_q_w": 0.6251240458015267, "calib/step_q_w_n": 1048.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 850.67578125, "completions/mean_terminated_length": 864.1786499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 316.0, "epoch": 0.010666666666666666, "grad_norm": 0.5259931683540344, "kl": 0.001390993595123291, "learning_rate": 2.25e-06, "loss": -0.1098, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018846701830625534, "mask/share_reasoning": 0.7782702445983887, "mask/share_step_conf": 0.1872580647468567, "num_tokens": 3220064.0, "reward": 0.996660053730011, "reward_std": 0.09596420079469681, "rewards/accuracy_reward_step": 0.7578125, "rewards/asymmetric_l2_reward": 0.8554973602294922, "rewards/final_brier_reward_step": 0.7901663780212402, "rewards/format_reward_step": 0.98046875, "step": 10 }, { "adv/mean_abs_final_conf": 0.7603298425674438, "adv/mean_abs_reasoning": 0.2598026990890503, "adv/mean_abs_step_conf": 0.7337856292724609, "adv/ratio_final_to_reasoning": 2.9265663722255337, "adv/ratio_step_to_reasoning": 2.8243957119973864, "adv/std_final_conf": 0.9253048896789551, "adv/std_reasoning": 0.5483741164207458, "adv/std_step_conf": 0.919272780418396, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.17578125, "calib/ece": 0.17549407114624513, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": -0.05510143702451409, "calib/mean_conf": 0.6753359683794466, "calib/mu_c": 0.6570414201183431, "calib/mu_w": 0.7121428571428572, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0914229249011858, "calib/std_conf": 0.0670760421137329, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5807924874026569, "calib/step_q_c_n": 2183.0, "calib/step_q_gap": -0.04526779461849473, "calib/step_q_w": 0.6260602820211516, "calib/step_q_w_n": 1702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 860.28125, "completions/mean_terminated_length": 870.4822387695312, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.011733333333333333, "grad_norm": 0.6853123307228088, "kl": 0.0028656721115112305, "learning_rate": 2.5e-06, "loss": -0.0202, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01876617595553398, "mask/share_reasoning": 0.776242196559906, "mask/share_step_conf": 0.1932728886604309, "num_tokens": 3544776.0, "reward": 0.9628890752792358, "reward_std": 0.08348000049591064, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.8556461334228516, "rewards/final_brier_reward_step": 0.7404444813728333, "rewards/format_reward_step": 0.98828125, "step": 11 }, { "adv/mean_abs_final_conf": 0.7743625640869141, "adv/mean_abs_reasoning": 0.44604480266571045, "adv/mean_abs_step_conf": 0.7665374875068665, "adv/ratio_final_to_reasoning": 1.7360645375959292, "adv/ratio_step_to_reasoning": 1.7185212851394889, "adv/std_final_conf": 0.9307608008384705, "adv/std_reasoning": 0.7015324831008911, "adv/std_step_conf": 0.9288511872291565, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 14.80078125, "calib/ece": 0.09712500000000002, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.008333333333333333, "calib/gap": -0.010127840909090913, "calib/mean_conf": 0.6617916666666666, "calib/mu_c": 0.6590909090909091, "calib/mu_w": 0.66921875, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012791666666666665, "calib/std_conf": 0.05988633620358562, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5855131052865393, "calib/step_q_c_n": 2251.0, "calib/step_q_gap": -0.08256881929083393, "calib/step_q_w": 0.6680819245773733, "calib/step_q_w_n": 1538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 709.53515625, "completions/mean_terminated_length": 760.004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.0128, "grad_norm": 0.47538426518440247, "kl": 0.0029186010360717773, "learning_rate": 2.7500000000000004e-06, "loss": -0.2091, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.02002115733921528, "mask/share_reasoning": 0.7248589396476746, "mask/share_step_conf": 0.1887136548757553, "num_tokens": 3830593.0, "reward": 0.9528301358222961, "reward_std": 0.13713400065898895, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.8383677005767822, "rewards/final_brier_reward_step": 0.7422925233840942, "rewards/format_reward_step": 0.9375, "step": 12 }, { "adv/mean_abs_final_conf": 0.7659485340118408, "adv/mean_abs_reasoning": 0.23081842064857483, "adv/mean_abs_step_conf": 0.7453242540359497, "adv/ratio_final_to_reasoning": 3.3184029760692764, "adv/ratio_step_to_reasoning": 3.2290501422792386, "adv/std_final_conf": 0.928123414516449, "adv/std_reasoning": 0.496012419462204, "adv/std_step_conf": 0.9245219826698303, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.63671875, "calib/ece": 0.09760509803921562, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0015764165134548591, "calib/mean_conf": 0.6592576470588236, "calib/mu_c": 0.659640932642487, "calib/mu_w": 0.6580645161290322, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.06335861092840808, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5887108586830959, "calib/step_q_c_n": 2597.0, "calib/step_q_gap": 0.001652693134997385, "calib/step_q_w": 0.5870581655480985, "calib/step_q_w_n": 894.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 791.09765625, "completions/mean_terminated_length": 797.3267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.013866666666666666, "grad_norm": 0.23032566905021667, "kl": 0.0032243728637695312, "learning_rate": 3e-06, "loss": 0.0104, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01999080926179886, "mask/share_reasoning": 0.7759867906570435, "mask/share_step_conf": 0.19620992243289948, "num_tokens": 4137706.0, "reward": 1.0154926776885986, "reward_std": 0.058850668370723724, "rewards/accuracy_reward_step": 0.75390625, "rewards/asymmetric_l2_reward": 0.8811044692993164, "rewards/final_brier_reward_step": 0.7998808026313782, "rewards/format_reward_step": 0.99609375, "step": 13 }, { "adv/mean_abs_final_conf": 0.7976450324058533, "adv/mean_abs_reasoning": 0.3876098096370697, "adv/mean_abs_step_conf": 0.771583080291748, "adv/ratio_final_to_reasoning": 2.0578556387742393, "adv/ratio_step_to_reasoning": 1.9906180419277923, "adv/std_final_conf": 0.9286239743232727, "adv/std_reasoning": 0.6404783725738525, "adv/std_step_conf": 0.9233399033546448, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 14.9609375, "calib/ece": 0.15670731707317068, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.016260162601626018, "calib/gap": -0.021074511046005218, "calib/mean_conf": 0.6661382113821138, "calib/mu_c": 0.6598843930635838, "calib/mu_w": 0.680958904109589, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05979674796747966, "calib/std_conf": 0.0657115927210645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5883779044278825, "calib/step_q_c_n": 2281.0, "calib/step_q_gap": -0.05567632410665602, "calib/step_q_w": 0.6440542285345385, "calib/step_q_w_n": 1549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 824.8125, "completions/mean_terminated_length": 858.3414306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.014933333333333333, "grad_norm": 6.704521179199219, "kl": 0.028210163116455078, "learning_rate": 3.2500000000000002e-06, "loss": -0.149, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01951184868812561, "mask/share_reasoning": 0.7569153904914856, "mask/share_step_conf": 0.1845102608203888, "num_tokens": 4454258.0, "reward": 0.959224283695221, "reward_std": 0.1080828532576561, "rewards/accuracy_reward_step": 0.67578125, "rewards/asymmetric_l2_reward": 0.8446293473243713, "rewards/final_brier_reward_step": 0.7464753985404968, "rewards/format_reward_step": 0.9609375, "step": 14 }, { "adv/mean_abs_final_conf": 0.7762026786804199, "adv/mean_abs_reasoning": 0.34601423144340515, "adv/mean_abs_step_conf": 0.734488844871521, "adv/ratio_final_to_reasoning": 2.243268074386638, "adv/ratio_step_to_reasoning": 2.12271281966521, "adv/std_final_conf": 0.9263595342636108, "adv/std_reasoning": 0.6187130808830261, "adv/std_step_conf": 0.9260103702545166, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.0078125, "calib/ece": 0.11416666666666668, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.022878492527615313, "calib/mean_conf": 0.6680555555555556, "calib/mu_c": 0.6607017543859649, "calib/mu_w": 0.6835802469135802, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05182539682539683, "calib/std_conf": 0.0655243569829166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.589272646140116, "calib/step_q_c_n": 2241.0, "calib/step_q_gap": -0.023069361294828283, "calib/step_q_w": 0.6123420074349443, "calib/step_q_w_n": 1345.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 783.28125, "completions/mean_terminated_length": 792.5692138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.016, "grad_norm": 1.0809299945831299, "kl": 0.01723623275756836, "learning_rate": 3.5e-06, "loss": -0.0751, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020053813233971596, "mask/share_reasoning": 0.7671903371810913, "mask/share_step_conf": 0.20103709399700165, "num_tokens": 4762658.0, "reward": 0.962548017501831, "reward_std": 0.10276056826114655, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8391159772872925, "rewards/final_brier_reward_step": 0.7555113434791565, "rewards/format_reward_step": 0.984375, "step": 15 }, { "adv/mean_abs_final_conf": 0.7713512182235718, "adv/mean_abs_reasoning": 0.29494601488113403, "adv/mean_abs_step_conf": 0.758004903793335, "adv/ratio_final_to_reasoning": 2.6152284801489296, "adv/ratio_step_to_reasoning": 2.569978455544883, "adv/std_final_conf": 0.9274030327796936, "adv/std_reasoning": 0.5727872252464294, "adv/std_step_conf": 0.9221407175064087, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 16.08984375, "calib/ece": 0.13076612903225798, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.008064516129032258, "calib/gap": -0.01582013047530284, "calib/mean_conf": 0.6909274193548387, "calib/mu_c": 0.6862068965517242, "calib/mu_w": 0.702027027027027, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06004032258064514, "calib/std_conf": 0.0636233464690511, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5987816091954024, "calib/step_q_c_n": 2610.0, "calib/step_q_gap": -0.02624158497292106, "calib/step_q_w": 0.6250231941683234, "calib/step_q_w_n": 1509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 980.61328125, "completions/mean_terminated_length": 1008.1806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.017066666666666667, "grad_norm": 0.32864105701446533, "kl": 0.007008552551269531, "learning_rate": 3.7500000000000005e-06, "loss": -0.1074, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01608041673898697, "mask/share_reasoning": 0.7731291055679321, "mask/share_step_conf": 0.1834467351436615, "num_tokens": 5122543.0, "reward": 0.9711743593215942, "reward_std": 0.09810502827167511, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.8571702241897583, "rewards/final_brier_reward_step": 0.7554910182952881, "rewards/format_reward_step": 0.96875, "step": 16 }, { "adv/mean_abs_final_conf": 0.7650580406188965, "adv/mean_abs_reasoning": 0.31295207142829895, "adv/mean_abs_step_conf": 0.7583704590797424, "adv/ratio_final_to_reasoning": 2.4446492305585537, "adv/ratio_step_to_reasoning": 2.4232798831417677, "adv/std_final_conf": 0.9286947846412659, "adv/std_reasoning": 0.5960761904716492, "adv/std_step_conf": 0.9244705438613892, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.1796875, "calib/ece": 0.11960474308300398, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.02766798418972332, "calib/gap": -0.018812003968253888, "calib/mean_conf": 0.6807905138339921, "calib/mu_c": 0.6760317460317461, "calib/mu_w": 0.69484375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.026679841897233214, "calib/std_conf": 0.08426013496782013, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6054649610678532, "calib/step_q_c_n": 2697.0, "calib/step_q_gap": -0.021350261808513515, "calib/step_q_w": 0.6268152228763667, "calib/step_q_w_n": 1189.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 825.765625, "completions/mean_terminated_length": 835.557373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.018133333333333335, "grad_norm": 0.22838030755519867, "kl": 0.009852409362792969, "learning_rate": 4.000000000000001e-06, "loss": -0.0646, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02119205705821514, "mask/share_reasoning": 0.7587161064147949, "mask/share_step_conf": 0.2083730548620224, "num_tokens": 5437467.0, "reward": 0.9974074959754944, "reward_std": 0.0782276839017868, "rewards/accuracy_reward_step": 0.73828125, "rewards/asymmetric_l2_reward": 0.8663601875305176, "rewards/final_brier_reward_step": 0.7831422090530396, "rewards/format_reward_step": 0.98828125, "step": 17 }, { "adv/mean_abs_final_conf": 0.7350620627403259, "adv/mean_abs_reasoning": 0.3131064176559448, "adv/mean_abs_step_conf": 0.7618484497070312, "adv/ratio_final_to_reasoning": 2.347642914007737, "adv/ratio_step_to_reasoning": 2.4331933385798052, "adv/std_final_conf": 0.9270232319831848, "adv/std_reasoning": 0.6184937357902527, "adv/std_step_conf": 0.9262146949768066, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.4375, "calib/ece": 0.14473469387755092, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.00816326530612245, "calib/gap": -0.039506916555109295, "calib/mean_conf": 0.6713469387755103, "calib/mu_c": 0.657962962962963, "calib/mu_w": 0.6974698795180723, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07742857142857142, "calib/std_conf": 0.07316895489219906, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5883053221288516, "calib/step_q_c_n": 2142.0, "calib/step_q_gap": -0.05752622483247449, "calib/step_q_w": 0.645831546961326, "calib/step_q_w_n": 1810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 823.45703125, "completions/mean_terminated_length": 853.4615478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 281.0, "epoch": 0.0192, "grad_norm": 0.3023185729980469, "kl": 0.012590408325195312, "learning_rate": 4.25e-06, "loss": -0.1426, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019104354083538055, "mask/share_reasoning": 0.757148027420044, "mask/share_step_conf": 0.18859142065048218, "num_tokens": 5758992.0, "reward": 0.9279800653457642, "reward_std": 0.08618418872356415, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.8175023794174194, "rewards/final_brier_reward_step": 0.7204890251159668, "rewards/format_reward_step": 0.95703125, "step": 18 }, { "adv/mean_abs_final_conf": 0.7569583058357239, "adv/mean_abs_reasoning": 0.26778334379196167, "adv/mean_abs_step_conf": 0.754361629486084, "adv/ratio_final_to_reasoning": 2.826756493203691, "adv/ratio_step_to_reasoning": 2.8170595631673803, "adv/std_final_conf": 0.927824854850769, "adv/std_reasoning": 0.548314094543457, "adv/std_step_conf": 0.9214946627616882, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.6640625, "calib/ece": 0.08305882352941184, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.00786214625945647, "calib/mean_conf": 0.6467450980392156, "calib/mu_c": 0.6441860465116278, "calib/mu_w": 0.6520481927710843, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027647058823529427, "calib/std_conf": 0.04172733299778498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.575268245529241, "calib/step_q_c_n": 2069.0, "calib/step_q_gap": -0.018661848247400137, "calib/step_q_w": 0.5939300937766412, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 688.62890625, "completions/mean_terminated_length": 694.0512084960938, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.020266666666666665, "grad_norm": 0.31743717193603516, "kl": 0.020746231079101562, "learning_rate": 4.5e-06, "loss": -0.016, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022184943780303, "mask/share_reasoning": 0.7688775062561035, "mask/share_step_conf": 0.20112502574920654, "num_tokens": 6040041.0, "reward": 0.9946879148483276, "reward_std": 0.05926477164030075, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8843179941177368, "rewards/final_brier_reward_step": 0.7714640498161316, "rewards/format_reward_step": 0.99609375, "step": 19 }, { "adv/mean_abs_final_conf": 0.7118607759475708, "adv/mean_abs_reasoning": 0.23226484656333923, "adv/mean_abs_step_conf": 0.7482800483703613, "adv/ratio_final_to_reasoning": 3.0648666230834225, "adv/ratio_step_to_reasoning": 3.2216672451390673, "adv/std_final_conf": 0.9108331799507141, "adv/std_reasoning": 0.5481703281402588, "adv/std_step_conf": 0.920397162437439, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.29296875, "calib/ece": 0.20247999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.032, "calib/gap": -0.045271739130434696, "calib/mean_conf": 0.6616799999999999, "calib/mu_c": 0.6497282608695653, "calib/mu_w": 0.695, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06408000000000001, "calib/std_conf": 0.08062988031741086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5902098501070663, "calib/step_q_c_n": 2335.0, "calib/step_q_gap": -0.046074137808341575, "calib/step_q_w": 0.6362839879154079, "calib/step_q_w_n": 1324.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 750.6953125, "completions/mean_terminated_length": 768.7120361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.021333333333333333, "grad_norm": 0.7706186175346375, "kl": 0.017885208129882812, "learning_rate": 4.75e-06, "loss": -0.1237, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.022132711485028267, "mask/share_reasoning": 0.755659282207489, "mask/share_step_conf": 0.19877052307128906, "num_tokens": 6337091.0, "reward": 0.983320951461792, "reward_std": 0.08180738985538483, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.8696904182434082, "rewards/final_brier_reward_step": 0.7578890323638916, "rewards/format_reward_step": 0.9765625, "step": 20 }, { "adv/mean_abs_final_conf": 0.7302876710891724, "adv/mean_abs_reasoning": 0.34584808349609375, "adv/mean_abs_step_conf": 0.7366833686828613, "adv/ratio_final_to_reasoning": 2.111585132139154, "adv/ratio_step_to_reasoning": 2.1300779268050563, "adv/std_final_conf": 0.9276742339134216, "adv/std_reasoning": 0.6403311491012573, "adv/std_step_conf": 0.919529378414154, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.9921875, "calib/ece": 0.1839043824701195, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.027888446215139442, "calib/gap": -0.05810960706899804, "calib/mean_conf": 0.6534661354581672, "calib/mu_c": 0.6409644670050761, "calib/mu_w": 0.6990740740740742, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.026254980079681276, "calib/std_conf": 0.07655647765878142, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5809648405560098, "calib/step_q_c_n": 2446.0, "calib/step_q_gap": -0.07105100451441271, "calib/step_q_w": 0.6520158450704225, "calib/step_q_w_n": 1136.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 718.05078125, "completions/mean_terminated_length": 723.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.0224, "grad_norm": 0.30620530247688293, "kl": 0.024244308471679688, "learning_rate": 5e-06, "loss": -0.1036, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02390342392027378, "mask/share_reasoning": 0.7633707523345947, "mask/share_step_conf": 0.20491333305835724, "num_tokens": 6623872.0, "reward": 0.9959531426429749, "reward_std": 0.09705215692520142, "rewards/accuracy_reward_step": 0.76953125, "rewards/asymmetric_l2_reward": 0.8689085245132446, "rewards/final_brier_reward_step": 0.7729976177215576, "rewards/format_reward_step": 0.98046875, "step": 21 }, { "adv/mean_abs_final_conf": 0.761996865272522, "adv/mean_abs_reasoning": 0.22527900338172913, "adv/mean_abs_step_conf": 0.7466142177581787, "adv/ratio_final_to_reasoning": 3.3824584352467997, "adv/ratio_step_to_reasoning": 3.31417578447407, "adv/std_final_conf": 0.9259976148605347, "adv/std_reasoning": 0.4959462583065033, "adv/std_step_conf": 0.9223623871803284, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.6484375, "calib/ece": 0.11457031250000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0012986365147987788, "calib/mean_conf": 0.6432421875, "calib/mu_c": 0.6435567010309278, "calib/mu_w": 0.642258064516129, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.051728293468998605, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5789115646258504, "calib/step_q_c_n": 2205.0, "calib/step_q_gap": 0.0018459275602132896, "calib/step_q_w": 0.5770656370656371, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 666.578125, "completions/mean_terminated_length": 671.8267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.023466666666666667, "grad_norm": 0.219764843583107, "kl": 0.02294158935546875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0562, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023637905716896057, "mask/share_reasoning": 0.7702509164810181, "mask/share_step_conf": 0.19829866290092468, "num_tokens": 6896332.0, "reward": 1.022200107574463, "reward_std": 0.04603898897767067, "rewards/accuracy_reward_step": 0.7578125, "rewards/asymmetric_l2_reward": 0.8916957378387451, "rewards/final_brier_reward_step": 0.8011417984962463, "rewards/format_reward_step": 1.0, "step": 22 }, { "adv/mean_abs_final_conf": 0.7349973320960999, "adv/mean_abs_reasoning": 0.3252660632133484, "adv/mean_abs_step_conf": 0.7337431907653809, "adv/ratio_final_to_reasoning": 2.2596803516326287, "adv/ratio_step_to_reasoning": 2.25582461175516, "adv/std_final_conf": 0.9285386800765991, "adv/std_reasoning": 0.6185926795005798, "adv/std_step_conf": 0.9233041405677795, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.7421875, "calib/ece": 0.14608695652173917, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.014205893608878806, "calib/mean_conf": 0.6579446640316204, "calib/mu_c": 0.6550248756218905, "calib/mu_w": 0.6692307692307693, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004782608695652205, "calib/std_conf": 0.07007424705257945, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5922569832402235, "calib/step_q_c_n": 2685.0, "calib/step_q_gap": -0.014189715439248296, "calib/step_q_w": 0.6064466986794718, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 771.00390625, "completions/mean_terminated_length": 780.1463012695312, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.024533333333333334, "grad_norm": 0.20996451377868652, "kl": 0.024740219116210938, "learning_rate": 4.944444444444445e-06, "loss": -0.0632, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.022582894191145897, "mask/share_reasoning": 0.7702085971832275, "mask/share_step_conf": 0.1954898238182068, "num_tokens": 7197645.0, "reward": 1.0250046253204346, "reward_std": 0.08676713705062866, "rewards/accuracy_reward_step": 0.78515625, "rewards/asymmetric_l2_reward": 0.8962739109992981, "rewards/final_brier_reward_step": 0.7990476489067078, "rewards/format_reward_step": 0.98828125, "step": 23 }, { "adv/mean_abs_final_conf": 0.7553825378417969, "adv/mean_abs_reasoning": 0.5074846148490906, "adv/mean_abs_step_conf": 0.7459325790405273, "adv/ratio_final_to_reasoning": 1.4884836224373483, "adv/ratio_step_to_reasoning": 1.469862449450499, "adv/std_final_conf": 0.9313974976539612, "adv/std_reasoning": 0.7576209306716919, "adv/std_step_conf": 0.9261898994445801, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.05078125, "calib/ece": 0.08631578947368425, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.008097165991902834, "calib/gap": -0.012404352154743337, "calib/mean_conf": 0.6575708502024291, "calib/mu_c": 0.6531012658227848, "calib/mu_w": 0.6655056179775282, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05210526315789478, "calib/std_conf": 0.06595123060493638, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.585268817204301, "calib/step_q_c_n": 1953.0, "calib/step_q_gap": -0.04514480809983534, "calib/step_q_w": 0.6304136253041364, "calib/step_q_w_n": 1644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 740.19921875, "completions/mean_terminated_length": 767.1700439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.0256, "grad_norm": 0.3021528422832489, "kl": 0.026874542236328125, "learning_rate": 4.9166666666666665e-06, "loss": -0.1693, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02219841256737709, "mask/share_reasoning": 0.749724805355072, "mask/share_step_conf": 0.1929205060005188, "num_tokens": 7491648.0, "reward": 0.9377257823944092, "reward_std": 0.12792997062206268, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8266116976737976, "rewards/final_brier_reward_step": 0.732433557510376, "rewards/format_reward_step": 0.96484375, "step": 24 }, { "adv/mean_abs_final_conf": 0.7360978126525879, "adv/mean_abs_reasoning": 0.24087727069854736, "adv/mean_abs_step_conf": 0.7380087971687317, "adv/ratio_final_to_reasoning": 3.0559039901020726, "adv/ratio_step_to_reasoning": 3.0638374265388184, "adv/std_final_conf": 0.9257768988609314, "adv/std_reasoning": 0.5481476783752441, "adv/std_step_conf": 0.9245792031288147, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.23046875, "calib/ece": 0.13480158730158734, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.03190109890109882, "calib/mean_conf": 0.6556746031746032, "calib/mu_c": 0.6468131868131868, "calib/mu_w": 0.6787142857142856, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.034126984126984124, "calib/std_conf": 0.05825454943267895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.577986921999066, "calib/step_q_c_n": 2141.0, "calib/step_q_gap": -0.042414362110083315, "calib/step_q_w": 0.6204012841091493, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 705.66796875, "completions/mean_terminated_length": 719.72509765625, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.02666666666666667, "grad_norm": 0.44875988364219666, "kl": 0.0285491943359375, "learning_rate": 4.888888888888889e-06, "loss": -0.0343, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0222710520029068, "mask/share_reasoning": 0.7567205429077148, "mask/share_step_conf": 0.20147714018821716, "num_tokens": 7775523.0, "reward": 0.9849003553390503, "reward_std": 0.059770457446575165, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.8641456961631775, "rewards/final_brier_reward_step": 0.7665925621986389, "rewards/format_reward_step": 0.984375, "step": 25 }, { "adv/mean_abs_final_conf": 0.7698163390159607, "adv/mean_abs_reasoning": 0.343593031167984, "adv/mean_abs_step_conf": 0.7416131496429443, "adv/ratio_final_to_reasoning": 2.24048880269517, "adv/ratio_step_to_reasoning": 2.1584056787239283, "adv/std_final_conf": 0.9269220232963562, "adv/std_reasoning": 0.6186467409133911, "adv/std_step_conf": 0.9251113533973694, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.70703125, "calib/ece": 0.1145238095238094, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.02883074162679422, "calib/mean_conf": 0.6492063492063492, "calib/mu_c": 0.6405113636363636, "calib/mu_w": 0.6693421052631578, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03265873015873014, "calib/std_conf": 0.052728763491761484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5708939096267192, "calib/step_q_c_n": 2036.0, "calib/step_q_gap": -0.03748735578001883, "calib/step_q_w": 0.608381265406738, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 699.59765625, "completions/mean_terminated_length": 710.702392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.027733333333333332, "grad_norm": 0.2601587474346161, "kl": 0.0289459228515625, "learning_rate": 4.861111111111111e-06, "loss": -0.056, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02116210199892521, "mask/share_reasoning": 0.7684984803199768, "mask/share_step_conf": 0.19471439719200134, "num_tokens": 8059860.0, "reward": 0.9714474678039551, "reward_std": 0.09195204079151154, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.8485621213912964, "rewards/final_brier_reward_step": 0.7599577903747559, "rewards/format_reward_step": 0.984375, "step": 26 }, { "adv/mean_abs_final_conf": 0.765433132648468, "adv/mean_abs_reasoning": 0.3761621415615082, "adv/mean_abs_step_conf": 0.7535364031791687, "adv/ratio_final_to_reasoning": 2.0348489336833175, "adv/ratio_step_to_reasoning": 2.0032223339943798, "adv/std_final_conf": 0.9282750487327576, "adv/std_reasoning": 0.6403613686561584, "adv/std_step_conf": 0.9224004745483398, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.1328125, "calib/ece": 0.07011857707509882, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.002996894409937778, "calib/mean_conf": 0.6546245059288538, "calib/mu_c": 0.6557142857142857, "calib/mu_w": 0.6527173913043479, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.044189723320158115, "calib/std_conf": 0.059393819565856404, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5819625668449198, "calib/step_q_c_n": 1870.0, "calib/step_q_gap": -7.62681065364923e-05, "calib/step_q_w": 0.5820388349514563, "calib/step_q_w_n": 1236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 775.734375, "completions/mean_terminated_length": 784.932861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0288, "grad_norm": 0.2557304799556732, "kl": 0.028514862060546875, "learning_rate": 4.833333333333333e-06, "loss": -0.0815, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020870499312877655, "mask/share_reasoning": 0.7801257371902466, "mask/share_step_conf": 0.18728497624397278, "num_tokens": 8363664.0, "reward": 0.9708747267723083, "reward_std": 0.088234081864357, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.8611689805984497, "rewards/final_brier_reward_step": 0.757142961025238, "rewards/format_reward_step": 0.98828125, "step": 27 }, { "adv/mean_abs_final_conf": 0.7319560647010803, "adv/mean_abs_reasoning": 0.26165369153022766, "adv/mean_abs_step_conf": 0.7297533750534058, "adv/ratio_final_to_reasoning": 2.79742303814017, "adv/ratio_step_to_reasoning": 2.7890046984836854, "adv/std_final_conf": 0.9239341616630554, "adv/std_reasoning": 0.5725747346878052, "adv/std_step_conf": 0.9209737777709961, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.875, "calib/ece": 0.17386956521739136, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.02766798418972332, "calib/gap": -0.052451086956521786, "calib/mean_conf": 0.6618537549407115, "calib/mu_c": 0.6475489130434783, "calib/mu_w": 0.7000000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.054225296442687754, "calib/std_conf": 0.07876395889668247, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.58572350022553, "calib/step_q_c_n": 2217.0, "calib/step_q_gap": -0.04537762336997564, "calib/step_q_w": 0.6311011235955056, "calib/step_q_w_n": 1335.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 814.4296875, "completions/mean_terminated_length": 827.357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.029866666666666666, "grad_norm": 0.1882183700799942, "kl": 0.02552032470703125, "learning_rate": 4.805555555555556e-06, "loss": -0.1039, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020352616906166077, "mask/share_reasoning": 0.779977560043335, "mask/share_step_conf": 0.18404483795166016, "num_tokens": 8679102.0, "reward": 0.9769738912582397, "reward_std": 0.08208565413951874, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.8545345664024353, "rewards/final_brier_reward_step": 0.7587882280349731, "rewards/format_reward_step": 0.984375, "step": 28 }, { "adv/mean_abs_final_conf": 0.7318214774131775, "adv/mean_abs_reasoning": 0.39057785272598267, "adv/mean_abs_step_conf": 0.7644947171211243, "adv/ratio_final_to_reasoning": 1.8736891308750185, "adv/ratio_step_to_reasoning": 1.957342721266559, "adv/std_final_conf": 0.9285926222801208, "adv/std_reasoning": 0.6815193295478821, "adv/std_step_conf": 0.9241484999656677, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.15234375, "calib/ece": 0.07621513944223113, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": -0.019785104503974082, "calib/mean_conf": 0.6680876494023904, "calib/mu_c": 0.661860465116279, "calib/mu_w": 0.6816455696202531, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02952191235059763, "calib/std_conf": 0.061814594639218866, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5862065972222221, "calib/step_q_c_n": 2304.0, "calib/step_q_gap": -0.02512016547679219, "calib/step_q_w": 0.6113267626990143, "calib/step_q_w_n": 1319.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 868.94921875, "completions/mean_terminated_length": 882.7421264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.030933333333333334, "grad_norm": 0.28926676511764526, "kl": 0.032993316650390625, "learning_rate": 4.777777777777778e-06, "loss": -0.0906, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018677806481719017, "mask/share_reasoning": 0.7811988592147827, "mask/share_step_conf": 0.18449833989143372, "num_tokens": 9008681.0, "reward": 0.973637580871582, "reward_std": 0.09528940171003342, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8602075576782227, "rewards/final_brier_reward_step": 0.7565988302230835, "rewards/format_reward_step": 0.98046875, "step": 29 }, { "adv/mean_abs_final_conf": 0.7216935753822327, "adv/mean_abs_reasoning": 0.3983932435512543, "adv/mean_abs_step_conf": 0.7292443513870239, "adv/ratio_final_to_reasoning": 1.8115105792184074, "adv/ratio_step_to_reasoning": 1.8304636516588033, "adv/std_final_conf": 0.9295569658279419, "adv/std_reasoning": 0.7012472748756409, "adv/std_step_conf": 0.9283719658851624, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.0390625, "calib/ece": 0.0574206349206349, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": 0.0029273255813953014, "calib/mean_conf": 0.6683730158730159, "calib/mu_c": 0.6693023255813954, "calib/mu_w": 0.666375, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.021626984126984126, "calib/std_conf": 0.06938599581121564, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5917662938681064, "calib/step_q_c_n": 2593.0, "calib/step_q_gap": -0.026801725224972373, "calib/step_q_w": 0.6185680190930788, "calib/step_q_w_n": 1257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 868.40625, "completions/mean_terminated_length": 882.1905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.032, "grad_norm": 0.37461334466934204, "kl": 0.038707733154296875, "learning_rate": 4.75e-06, "loss": -0.0406, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018724016845226288, "mask/share_reasoning": 0.7817549705505371, "mask/share_step_conf": 0.1838960349559784, "num_tokens": 9337977.0, "reward": 0.9659197330474854, "reward_std": 0.10380983352661133, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8331959247589111, "rewards/final_brier_reward_step": 0.7673933506011963, "rewards/format_reward_step": 0.984375, "step": 30 }, { "adv/mean_abs_final_conf": 0.7534240484237671, "adv/mean_abs_reasoning": 0.3624636232852936, "adv/mean_abs_step_conf": 0.7267440557479858, "adv/ratio_final_to_reasoning": 2.078619756639, "adv/ratio_step_to_reasoning": 2.005012390377085, "adv/std_final_conf": 0.9301952719688416, "adv/std_reasoning": 0.6403642892837524, "adv/std_step_conf": 0.9265068769454956, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.16796875, "calib/ece": 0.04310756972111554, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0017000948638026259, "calib/mean_conf": 0.6603187250996017, "calib/mu_c": 0.6609554140127388, "calib/mu_w": 0.6592553191489362, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03896414342629482, "calib/std_conf": 0.058164983852673775, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5852656104380243, "calib/step_q_c_n": 2146.0, "calib/step_q_gap": -0.01944066910282649, "calib/step_q_w": 0.6047062795408508, "calib/step_q_w_n": 1481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 833.33203125, "completions/mean_terminated_length": 849.9323120117188, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.03306666666666667, "grad_norm": 0.27699559926986694, "kl": 0.03562164306640625, "learning_rate": 4.722222222222222e-06, "loss": -0.1066, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018625345081090927, "mask/share_reasoning": 0.7823229432106018, "mask/share_step_conf": 0.17952045798301697, "num_tokens": 9657222.0, "reward": 0.9539623260498047, "reward_std": 0.10619272291660309, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8421058058738708, "rewards/final_brier_reward_step": 0.7470687627792358, "rewards/format_reward_step": 0.98046875, "step": 31 }, { "adv/mean_abs_final_conf": 0.7851700186729431, "adv/mean_abs_reasoning": 0.384133517742157, "adv/mean_abs_step_conf": 0.762548565864563, "adv/ratio_final_to_reasoning": 2.0440028854758125, "adv/ratio_step_to_reasoning": 1.9851133281642208, "adv/std_final_conf": 0.9287110567092896, "adv/std_reasoning": 0.640393078327179, "adv/std_step_conf": 0.9280316233634949, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.1953125, "calib/ece": 0.05750000000000001, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01973328899235116, "calib/mean_conf": 0.6445634920634921, "calib/mu_c": 0.6369677419354839, "calib/mu_w": 0.656701030927835, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0434920634920635, "calib/std_conf": 0.04393591943247486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.567911014650027, "calib/step_q_c_n": 1843.0, "calib/step_q_gap": -0.023131330626845914, "calib/step_q_w": 0.5910423452768729, "calib/step_q_w_n": 1535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 781.3671875, "completions/mean_terminated_length": 793.7698974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.034133333333333335, "grad_norm": 0.29690098762512207, "kl": 0.05696868896484375, "learning_rate": 4.694444444444445e-06, "loss": -0.0084, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019794052466750145, "mask/share_reasoning": 0.7795528173446655, "mask/share_step_conf": 0.18502815067768097, "num_tokens": 9963956.0, "reward": 0.9460583329200745, "reward_std": 0.0918600782752037, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8347843289375305, "rewards/final_brier_reward_step": 0.7393636703491211, "rewards/format_reward_step": 0.984375, "step": 32 }, { "adv/mean_abs_final_conf": 0.7632274627685547, "adv/mean_abs_reasoning": 0.2787264585494995, "adv/mean_abs_step_conf": 0.7655272483825684, "adv/ratio_final_to_reasoning": 2.7382669974727634, "adv/ratio_step_to_reasoning": 2.7465180462823446, "adv/std_final_conf": 0.9271504282951355, "adv/std_reasoning": 0.5726523995399475, "adv/std_step_conf": 0.9253469109535217, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 12.26171875, "calib/ece": 0.06662008032128516, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004728800886181106, "calib/mean_conf": 0.6353879518072288, "calib/mu_c": 0.6336407643312102, "calib/mu_w": 0.6383695652173913, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03574297188755021, "calib/std_conf": 0.06512854400644379, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5650610021786492, "calib/step_q_c_n": 1836.0, "calib/step_q_gap": -0.02135496098328482, "calib/step_q_w": 0.586415963161934, "calib/step_q_w_n": 1303.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 733.35546875, "completions/mean_terminated_length": 753.9718627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.0352, "grad_norm": 0.26579010486602783, "kl": 0.06195831298828125, "learning_rate": 4.666666666666667e-06, "loss": -0.0785, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020835505798459053, "mask/share_reasoning": 0.7707878351211548, "mask/share_step_conf": 0.18103289604187012, "num_tokens": 10258567.0, "reward": 0.9497272968292236, "reward_std": 0.0785764828324318, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8424965143203735, "rewards/final_brier_reward_step": 0.7397705316543579, "rewards/format_reward_step": 0.97265625, "step": 33 }, { "adv/mean_abs_final_conf": 0.76878422498703, "adv/mean_abs_reasoning": 0.39741986989974976, "adv/mean_abs_step_conf": 0.751526951789856, "adv/ratio_final_to_reasoning": 1.9344383188011156, "adv/ratio_step_to_reasoning": 1.8910150415464397, "adv/std_final_conf": 0.9294537901878357, "adv/std_reasoning": 0.6815695762634277, "adv/std_step_conf": 0.9264436364173889, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.7734375, "calib/ece": 0.1493665354330709, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.024256979368149634, "calib/mean_conf": 0.6499248031496062, "calib/mu_c": 0.6436218085106383, "calib/mu_w": 0.667878787878788, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.029566929133858263, "calib/std_conf": 0.07300219427562256, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5728191419141915, "calib/step_q_c_n": 2424.0, "calib/step_q_gap": -0.02581969656130767, "calib/step_q_w": 0.5986388384754991, "calib/step_q_w_n": 1102.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 789.125, "completions/mean_terminated_length": 795.3385620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.03626666666666667, "grad_norm": 0.2776516377925873, "kl": 0.05954742431640625, "learning_rate": 4.638888888888889e-06, "loss": -0.0325, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02026517689228058, "mask/share_reasoning": 0.778353214263916, "mask/share_step_conf": 0.1935690939426422, "num_tokens": 10565695.0, "reward": 0.994615375995636, "reward_std": 0.08916525542736053, "rewards/accuracy_reward_step": 0.734375, "rewards/asymmetric_l2_reward": 0.8651760816574097, "rewards/final_brier_reward_step": 0.7787420749664307, "rewards/format_reward_step": 0.9921875, "step": 34 }, { "adv/mean_abs_final_conf": 0.738964319229126, "adv/mean_abs_reasoning": 0.3949841856956482, "adv/mean_abs_step_conf": 0.7444653511047363, "adv/ratio_final_to_reasoning": 1.8708706474606274, "adv/ratio_step_to_reasoning": 1.8847978680300328, "adv/std_final_conf": 0.931449830532074, "adv/std_reasoning": 0.6815976500511169, "adv/std_step_conf": 0.9286965131759644, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.08984375, "calib/ece": 0.08798853754940718, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.020966805151640955, "calib/mean_conf": 0.6639569169960475, "calib/mu_c": 0.6567469879518073, "calib/mu_w": 0.6777137931034483, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04790948616600791, "calib/std_conf": 0.07293426409468073, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5800000000000001, "calib/step_q_c_n": 2168.0, "calib/step_q_gap": -0.02129583043780392, "calib/step_q_w": 0.601295830437804, "calib/step_q_w_n": 1439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 942.46484375, "completions/mean_terminated_length": 953.640380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 318.0, "epoch": 0.037333333333333336, "grad_norm": 0.533706784248352, "kl": 0.042438507080078125, "learning_rate": 4.611111111111112e-06, "loss": -0.072, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01712356135249138, "mask/share_reasoning": 0.7983591556549072, "mask/share_step_conf": 0.17279846966266632, "num_tokens": 10916222.0, "reward": 0.960017204284668, "reward_std": 0.10911445319652557, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.8412766456604004, "rewards/final_brier_reward_step": 0.7506327629089355, "rewards/format_reward_step": 0.98828125, "step": 35 }, { "adv/mean_abs_final_conf": 0.7283240556716919, "adv/mean_abs_reasoning": 0.2501176595687866, "adv/mean_abs_step_conf": 0.7294330596923828, "adv/ratio_final_to_reasoning": 2.911925758970211, "adv/ratio_step_to_reasoning": 2.9163596882761342, "adv/std_final_conf": 0.9135305285453796, "adv/std_reasoning": 0.5228530168533325, "adv/std_step_conf": 0.9238290786743164, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.0234375, "calib/ece": 0.21769881422924908, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.025325840668020905, "calib/mean_conf": 0.6448308300395257, "calib/mu_c": 0.6406265402843601, "calib/mu_w": 0.665952380952381, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014268774703557308, "calib/std_conf": 0.08446825676718633, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5701948612153038, "calib/step_q_c_n": 2666.0, "calib/step_q_gap": -0.02612250405415728, "calib/step_q_w": 0.5963173652694611, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 788.94140625, "completions/mean_terminated_length": 798.2964477539062, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.0384, "grad_norm": 0.3193471133708954, "kl": 0.0623931884765625, "learning_rate": 4.583333333333333e-06, "loss": -0.0279, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020868193358182907, "mask/share_reasoning": 0.7738605737686157, "mask/share_step_conf": 0.19355246424674988, "num_tokens": 11220903.0, "reward": 1.0232408046722412, "reward_std": 0.07451615482568741, "rewards/accuracy_reward_step": 0.82421875, "rewards/asymmetric_l2_reward": 0.8818714618682861, "rewards/final_brier_reward_step": 0.802109956741333, "rewards/format_reward_step": 0.98828125, "step": 36 }, { "adv/mean_abs_final_conf": 0.7440776824951172, "adv/mean_abs_reasoning": 0.35688239336013794, "adv/mean_abs_step_conf": 0.7341234683990479, "adv/ratio_final_to_reasoning": 2.0849380533722544, "adv/ratio_step_to_reasoning": 2.0570459122039892, "adv/std_final_conf": 0.9291855692863464, "adv/std_reasoning": 0.6612588763237, "adv/std_step_conf": 0.927453339099884, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.41015625, "calib/ece": 0.12987951807228912, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.037745098039215774, "calib/mean_conf": 0.645140562248996, "calib/mu_c": 0.6305882352941176, "calib/mu_w": 0.6683333333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08028112449799196, "calib/std_conf": 0.0679835817741941, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.571936525013448, "calib/step_q_c_n": 1859.0, "calib/step_q_gap": -0.031103365696934393, "calib/step_q_w": 0.6030398907103824, "calib/step_q_w_n": 1830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 871.140625, "completions/mean_terminated_length": 888.4940795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 0.039466666666666664, "grad_norm": 0.499504953622818, "kl": 0.0570526123046875, "learning_rate": 4.555555555555556e-06, "loss": -0.1091, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018286598846316338, "mask/share_reasoning": 0.7885184288024902, "mask/share_step_conf": 0.17366373538970947, "num_tokens": 11551011.0, "reward": 0.9410471320152283, "reward_std": 0.1150817796587944, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.8486028909683228, "rewards/final_brier_reward_step": 0.7194288969039917, "rewards/format_reward_step": 0.97265625, "step": 37 }, { "adv/mean_abs_final_conf": 0.7383832931518555, "adv/mean_abs_reasoning": 0.39670035243034363, "adv/mean_abs_step_conf": 0.7383660674095154, "adv/ratio_final_to_reasoning": 1.8613124203904199, "adv/ratio_step_to_reasoning": 1.8612689978367605, "adv/std_final_conf": 0.9327216744422913, "adv/std_reasoning": 0.6819123029708862, "adv/std_step_conf": 0.9317123889923096, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 16.00390625, "calib/ece": 0.11084033613445371, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.008403361344537815, "calib/gap": -0.024425072644250645, "calib/mean_conf": 0.6599159663865545, "calib/mu_c": 0.6524242424242425, "calib/mu_w": 0.6768493150684931, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03873949579831933, "calib/std_conf": 0.07487800496425798, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5802545118000925, "calib/step_q_c_n": 2161.0, "calib/step_q_gap": -0.058733091505692614, "calib/step_q_w": 0.6389876033057851, "calib/step_q_w_n": 1936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 847.14453125, "completions/mean_terminated_length": 907.4016723632812, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.04053333333333333, "grad_norm": 0.3331059515476227, "kl": 0.05327606201171875, "learning_rate": 4.527777777777778e-06, "loss": -0.399, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.016576403751969337, "mask/share_reasoning": 0.746181309223175, "mask/share_step_conf": 0.17083601653575897, "num_tokens": 11874768.0, "reward": 0.9161571264266968, "reward_std": 0.16508501768112183, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.8005986213684082, "rewards/final_brier_reward_step": 0.7160905599594116, "rewards/format_reward_step": 0.9296875, "step": 38 }, { "adv/mean_abs_final_conf": 0.7298065423965454, "adv/mean_abs_reasoning": 0.41687560081481934, "adv/mean_abs_step_conf": 0.7506943941116333, "adv/ratio_final_to_reasoning": 1.7506578484566513, "adv/ratio_step_to_reasoning": 1.8007635674631384, "adv/std_final_conf": 0.9323946833610535, "adv/std_reasoning": 0.7013625502586365, "adv/std_step_conf": 0.9290485978126526, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.22265625, "calib/ece": 0.10350806451612897, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.008064516129032258, "calib/gap": -0.038975675675675836, "calib/mean_conf": 0.6425403225806451, "calib/mu_c": 0.6268243243243243, "calib/mu_w": 0.6658000000000002, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07463709677419354, "calib/std_conf": 0.08244009547441963, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5571152815013405, "calib/step_q_c_n": 1865.0, "calib/step_q_gap": -0.04928235629393518, "calib/step_q_w": 0.6063976377952757, "calib/step_q_w_n": 2032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 873.58203125, "completions/mean_terminated_length": 901.7620849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.0416, "grad_norm": 0.4956228733062744, "kl": 0.06043243408203125, "learning_rate": 4.5e-06, "loss": -0.1951, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018304044380784035, "mask/share_reasoning": 0.7745673656463623, "mask/share_step_conf": 0.1758786141872406, "num_tokens": 12204493.0, "reward": 0.9280663728713989, "reward_std": 0.10901758074760437, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8379073739051819, "rewards/final_brier_reward_step": 0.7088503837585449, "rewards/format_reward_step": 0.96875, "step": 39 }, { "adv/mean_abs_final_conf": 0.7641886472702026, "adv/mean_abs_reasoning": 0.4124321937561035, "adv/mean_abs_step_conf": 0.7539259791374207, "adv/ratio_final_to_reasoning": 1.8528831134897155, "adv/ratio_step_to_reasoning": 1.8279998277323215, "adv/std_final_conf": 0.9351666569709778, "adv/std_reasoning": 0.7014144062995911, "adv/std_step_conf": 0.9318673014640808, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 16.4609375, "calib/ece": 0.09181200000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.004, "calib/gap": -0.005351805042452473, "calib/mean_conf": 0.617628, "calib/mu_c": 0.615251798561151, "calib/mu_w": 0.6206036036036034, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07672000000000002, "calib/std_conf": 0.08538108464993871, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5290776470588235, "calib/step_q_c_n": 2125.0, "calib/step_q_gap": -0.024259834990003615, "calib/step_q_w": 0.5533374820488272, "calib/step_q_w_n": 2089.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 998.48046875, "completions/mean_terminated_length": 1018.3705444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.042666666666666665, "grad_norm": 0.5073999762535095, "kl": 0.0644989013671875, "learning_rate": 4.472222222222223e-06, "loss": -0.0924, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016091875731945038, "mask/share_reasoning": 0.7875421047210693, "mask/share_step_conf": 0.17683476209640503, "num_tokens": 12566864.0, "reward": 0.931105375289917, "reward_std": 0.12366892397403717, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8362286686897278, "rewards/final_brier_reward_step": 0.722075879573822, "rewards/format_reward_step": 0.9765625, "step": 40 }, { "adv/mean_abs_final_conf": 0.7500344514846802, "adv/mean_abs_reasoning": 0.28220200538635254, "adv/mean_abs_step_conf": 0.7471286654472351, "adv/ratio_final_to_reasoning": 2.6577927766949605, "adv/ratio_step_to_reasoning": 2.6474959468284727, "adv/std_final_conf": 0.9347693920135498, "adv/std_reasoning": 0.5727576017379761, "adv/std_step_conf": 0.9301022291183472, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.68359375, "calib/ece": 0.35478260869565215, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007446681044811998, "calib/mean_conf": 0.4965217391304348, "calib/mu_c": 0.4953738317757009, "calib/mu_w": 0.5028205128205129, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0027272727272727275, "calib/std_conf": 0.10131427117252774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42474197040779504, "calib/step_q_c_n": 2771.0, "calib/step_q_gap": -0.01013507877253278, "calib/step_q_w": 0.4348770491803278, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 842.11328125, "completions/mean_terminated_length": 852.098876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.04373333333333333, "grad_norm": 0.24111762642860413, "kl": 0.07025146484375, "learning_rate": 4.444444444444444e-06, "loss": -0.0857, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019385291263461113, "mask/share_reasoning": 0.788033664226532, "mask/share_step_conf": 0.18086230754852295, "num_tokens": 12889693.0, "reward": 0.9926100969314575, "reward_std": 0.0846317782998085, "rewards/accuracy_reward_step": 0.8359375, "rewards/asymmetric_l2_reward": 0.893618643283844, "rewards/final_brier_reward_step": 0.726757824420929, "rewards/format_reward_step": 0.98828125, "step": 41 }, { "adv/mean_abs_final_conf": 0.7757542133331299, "adv/mean_abs_reasoning": 0.29600927233695984, "adv/mean_abs_step_conf": 0.7710269093513489, "adv/ratio_final_to_reasoning": 2.620709166333331, "adv/ratio_step_to_reasoning": 2.6047390450446986, "adv/std_final_conf": 0.9351115822792053, "adv/std_reasoning": 0.5726499557495117, "adv/std_step_conf": 0.9305152297019958, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.046875, "calib/ece": 0.24106719367588936, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.009820371935756467, "calib/mean_conf": 0.4269169960474308, "calib/mu_c": 0.43017751479289934, "calib/mu_w": 0.4203571428571429, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.08885737688312316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3523915843709747, "calib/step_q_c_n": 2329.0, "calib/step_q_gap": -0.006203522179932974, "calib/step_q_w": 0.35859510655090765, "calib/step_q_w_n": 1267.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 819.4921875, "completions/mean_terminated_length": 829.2095336914062, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.0448, "grad_norm": 0.32992997765541077, "kl": 0.0951385498046875, "learning_rate": 4.416666666666667e-06, "loss": -0.074, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018915589898824692, "mask/share_reasoning": 0.7854932546615601, "mask/share_step_conf": 0.18387237191200256, "num_tokens": 13203851.0, "reward": 0.9707211256027222, "reward_std": 0.07159927487373352, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.9035864472389221, "rewards/final_brier_reward_step": 0.7081683874130249, "rewards/format_reward_step": 0.98828125, "step": 42 }, { "adv/mean_abs_final_conf": 0.7651442289352417, "adv/mean_abs_reasoning": 0.33971551060676575, "adv/mean_abs_step_conf": 0.7557969093322754, "adv/ratio_final_to_reasoning": 2.2523087849848773, "adv/ratio_step_to_reasoning": 2.2247936456664776, "adv/std_final_conf": 0.9350785613059998, "adv/std_reasoning": 0.6402810215950012, "adv/std_step_conf": 0.9280475974082947, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.84375, "calib/ece": 0.3594422310756972, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0024231565535914323, "calib/mean_conf": 0.37059760956175297, "calib/mu_c": 0.3712637362637363, "calib/mu_w": 0.3688405797101449, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0024701195219123504, "calib/std_conf": 0.08685772491123588, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28983962936564506, "calib/step_q_c_n": 2806.0, "calib/step_q_gap": -0.023432370634354993, "calib/step_q_w": 0.31327200000000005, "calib/step_q_w_n": 1250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 910.95703125, "completions/mean_terminated_length": 929.1036376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 0.04586666666666667, "grad_norm": 0.22645698487758636, "kl": 0.08038330078125, "learning_rate": 4.388888888888889e-06, "loss": -0.1327, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017826823517680168, "mask/share_reasoning": 0.7836995124816895, "mask/share_step_conf": 0.17894247174263, "num_tokens": 13542280.0, "reward": 0.9442841410636902, "reward_std": 0.0882578194141388, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.894922137260437, "rewards/final_brier_reward_step": 0.6553648114204407, "rewards/format_reward_step": 0.98046875, "step": 43 }, { "adv/mean_abs_final_conf": 0.7774562835693359, "adv/mean_abs_reasoning": 0.44238120317459106, "adv/mean_abs_step_conf": 0.7424235939979553, "adv/ratio_final_to_reasoning": 1.7574351667525607, "adv/ratio_step_to_reasoning": 1.6782439865668273, "adv/std_final_conf": 0.9354961514472961, "adv/std_reasoning": 0.7393777966499329, "adv/std_step_conf": 0.930855393409729, "calib/answer_extract_rate": 0.94921875, "calib/avg_num_step_conf": 16.7578125, "calib/ece": 0.262396694214876, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.009137529137529132, "calib/mean_conf": 0.3475206611570248, "calib/mu_c": 0.35125874125874124, "calib/mu_w": 0.3421212121212121, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009504132231404953, "calib/std_conf": 0.08982334353713944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26401697944593394, "calib/step_q_c_n": 2238.0, "calib/step_q_gap": -0.01742551568077172, "calib/step_q_w": 0.28144249512670566, "calib/step_q_w_n": 2052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 954.92578125, "completions/mean_terminated_length": 1006.0123291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.046933333333333334, "grad_norm": 0.4708811640739441, "kl": 0.093780517578125, "learning_rate": 4.361111111111112e-06, "loss": -0.2453, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01548642385751009, "mask/share_reasoning": 0.7631728649139404, "mask/share_step_conf": 0.17055943608283997, "num_tokens": 13893061.0, "reward": 0.9163950681686401, "reward_std": 0.14778360724449158, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.8746612071990967, "rewards/final_brier_reward_step": 0.6573476791381836, "rewards/format_reward_step": 0.9453125, "step": 44 }, { "adv/mean_abs_final_conf": 0.772063136100769, "adv/mean_abs_reasoning": 0.44855812191963196, "adv/mean_abs_step_conf": 0.7616181969642639, "adv/ratio_final_to_reasoning": 1.7212109164285725, "adv/ratio_step_to_reasoning": 1.6979253295088543, "adv/std_final_conf": 0.935443103313446, "adv/std_reasoning": 0.7206319570541382, "adv/std_step_conf": 0.930956244468689, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 16.19921875, "calib/ece": 0.33060000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.019194141836808343, "calib/mean_conf": 0.3374, "calib/mu_c": 0.3437724550898204, "calib/mu_w": 0.32457831325301206, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.09430185576116729, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25661035226455786, "calib/step_q_c_n": 2782.0, "calib/step_q_gap": -0.012481222827017247, "calib/step_q_w": 0.2690915750915751, "calib/step_q_w_n": 1365.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 917.94140625, "completions/mean_terminated_length": 939.9720458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.048, "grad_norm": 0.3054089844226837, "kl": 0.08504486083984375, "learning_rate": 4.333333333333334e-06, "loss": -0.1635, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017406273633241653, "mask/share_reasoning": 0.7744444608688354, "mask/share_step_conf": 0.184711754322052, "num_tokens": 14233102.0, "reward": 0.9349931478500366, "reward_std": 0.108258917927742, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.891325831413269, "rewards/final_brier_reward_step": 0.6528792977333069, "rewards/format_reward_step": 0.9765625, "step": 45 }, { "adv/mean_abs_final_conf": 0.7224528789520264, "adv/mean_abs_reasoning": 0.3622780740261078, "adv/mean_abs_step_conf": 0.7105721831321716, "adv/ratio_final_to_reasoning": 1.9941943240538547, "adv/ratio_step_to_reasoning": 1.9613999137054146, "adv/std_final_conf": 0.920945942401886, "adv/std_reasoning": 0.64040607213974, "adv/std_step_conf": 0.9170863032341003, "calib/answer_extract_rate": 0.90234375, "calib/avg_num_step_conf": 17.59375, "calib/ece": 0.3652608695652174, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0043047238855621894, "calib/mean_conf": 0.36534782608695654, "calib/mu_c": 0.3665269461077844, "calib/mu_w": 0.3622222222222222, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0022608695652173915, "calib/std_conf": 0.10848606750684711, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28078593996840445, "calib/step_q_c_n": 2532.0, "calib/step_q_gap": -0.0552434717963014, "calib/step_q_w": 0.33602941176470585, "calib/step_q_w_n": 1972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 873.8671875, "completions/mean_terminated_length": 956.0256958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.04906666666666667, "grad_norm": 0.2894684374332428, "kl": 0.0829620361328125, "learning_rate": 4.305555555555556e-06, "loss": -0.3204, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.01669793389737606, "mask/share_reasoning": 0.7243597507476807, "mask/share_step_conf": 0.17300483584403992, "num_tokens": 14561580.0, "reward": 0.8605321645736694, "reward_std": 0.13566331565380096, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.8171077966690063, "rewards/final_brier_reward_step": 0.59380042552948, "rewards/format_reward_step": 0.8984375, "step": 46 }, { "adv/mean_abs_final_conf": 0.7682064771652222, "adv/mean_abs_reasoning": 0.33441025018692017, "adv/mean_abs_step_conf": 0.7347837686538696, "adv/ratio_final_to_reasoning": 2.297197758549056, "adv/ratio_step_to_reasoning": 2.197252531114578, "adv/std_final_conf": 0.9354805946350098, "adv/std_reasoning": 0.6404110789299011, "adv/std_step_conf": 0.9310937523841858, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.92578125, "calib/ece": 0.28800813008130083, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0193512906846241, "calib/mean_conf": 0.3880081300813008, "calib/mu_c": 0.3816363636363636, "calib/mu_w": 0.4009876543209877, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0026422764227642293, "calib/std_conf": 0.10860018385758737, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30016927634363094, "calib/step_q_c_n": 2363.0, "calib/step_q_gap": -0.01845966181272851, "calib/step_q_w": 0.31862893815635945, "calib/step_q_w_n": 1714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 924.41796875, "completions/mean_terminated_length": 958.1012573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.050133333333333335, "grad_norm": 0.23397418856620789, "kl": 0.08563995361328125, "learning_rate": 4.277777777777778e-06, "loss": -0.1835, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016216028481721878, "mask/share_reasoning": 0.7742945551872253, "mask/share_step_conf": 0.17433317005634308, "num_tokens": 14904207.0, "reward": 0.9234906435012817, "reward_std": 0.1315256506204605, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.873530924320221, "rewards/final_brier_reward_step": 0.6523566246032715, "rewards/format_reward_step": 0.9609375, "step": 47 }, { "adv/mean_abs_final_conf": 0.7578040361404419, "adv/mean_abs_reasoning": 0.45148777961730957, "adv/mean_abs_step_conf": 0.7425320148468018, "adv/ratio_final_to_reasoning": 1.678459684518532, "adv/ratio_step_to_reasoning": 1.6446336941305195, "adv/std_final_conf": 0.9358941912651062, "adv/std_reasoning": 0.7205584049224854, "adv/std_step_conf": 0.932444155216217, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.4765625, "calib/ece": 0.2909625, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0357313987657949, "calib/mean_conf": 0.4104891129032258, "calib/mu_c": 0.3986746987951807, "calib/mu_w": 0.4344060975609756, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01604838709677418, "calib/std_conf": 0.12742965106356055, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3295223214285714, "calib/step_q_c_n": 2240.0, "calib/step_q_gap": -0.046703804083024714, "calib/step_q_w": 0.3762261255115961, "calib/step_q_w_n": 1466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 825.63671875, "completions/mean_terminated_length": 848.8473510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.0512, "grad_norm": 0.4460994303226471, "kl": 0.09588623046875, "learning_rate": 4.25e-06, "loss": -0.1168, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018663503229618073, "mask/share_reasoning": 0.7787596583366394, "mask/share_step_conf": 0.1752331256866455, "num_tokens": 15219258.0, "reward": 0.9209808111190796, "reward_std": 0.11792835593223572, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.8601468801498413, "rewards/final_brier_reward_step": 0.6583772301673889, "rewards/format_reward_step": 0.96875, "step": 48 }, { "adv/mean_abs_final_conf": 0.7847864627838135, "adv/mean_abs_reasoning": 0.4784793257713318, "adv/mean_abs_step_conf": 0.7436771988868713, "adv/ratio_final_to_reasoning": 1.6401679665442175, "adv/ratio_step_to_reasoning": 1.5542514771939786, "adv/std_final_conf": 0.9357584118843079, "adv/std_reasoning": 0.7392579317092896, "adv/std_step_conf": 0.9319802522659302, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.0, "calib/ece": 0.26744841269841274, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.006241876708168892, "calib/mean_conf": 0.46072619047619046, "calib/mu_c": 0.45889325842696627, "calib/mu_w": 0.46513513513513516, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010912698412698412, "calib/std_conf": 0.12720399981143363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37630826140567203, "calib/step_q_c_n": 2433.0, "calib/step_q_gap": -0.019339001843676373, "calib/step_q_w": 0.3956472632493484, "calib/step_q_w_n": 1151.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 854.80078125, "completions/mean_terminated_length": 864.936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.05226666666666667, "grad_norm": 0.2933165729045868, "kl": 0.0948944091796875, "learning_rate": 4.222222222222223e-06, "loss": -0.0183, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018302440643310547, "mask/share_reasoning": 0.7933739423751831, "mask/share_step_conf": 0.17660485208034515, "num_tokens": 15542623.0, "reward": 0.965506911277771, "reward_std": 0.10440301150083542, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8927453756332397, "rewards/final_brier_reward_step": 0.7023309469223022, "rewards/format_reward_step": 0.984375, "step": 49 }, { "adv/mean_abs_final_conf": 0.7457829117774963, "adv/mean_abs_reasoning": 0.4013625979423523, "adv/mean_abs_step_conf": 0.7342361211776733, "adv/ratio_final_to_reasoning": 1.8581275774097246, "adv/ratio_step_to_reasoning": 1.8293586022759691, "adv/std_final_conf": 0.9355961084365845, "adv/std_reasoning": 0.6816041469573975, "adv/std_step_conf": 0.9320106506347656, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.40234375, "calib/ece": 0.2155905511811023, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.015748031496062992, "calib/gap": 0.011760792430514533, "calib/mean_conf": 0.5181102362204724, "calib/mu_c": 0.5216292134831461, "calib/mu_w": 0.5098684210526315, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.01645669291338583, "calib/std_conf": 0.14762546930173795, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44385708485586683, "calib/step_q_c_n": 2463.0, "calib/step_q_gap": -0.013773634098381504, "calib/step_q_w": 0.45763071895424834, "calib/step_q_w_n": 1224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 884.87109375, "completions/mean_terminated_length": 888.3412475585938, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.05333333333333334, "grad_norm": 0.5204945206642151, "kl": 0.1044921875, "learning_rate": 4.194444444444445e-06, "loss": -0.0222, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018934890627861023, "mask/share_reasoning": 0.7971692085266113, "mask/share_step_conf": 0.17998963594436646, "num_tokens": 15874510.0, "reward": 0.9724473357200623, "reward_std": 0.10486334562301636, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8775556087493896, "rewards/final_brier_reward_step": 0.730620265007019, "rewards/format_reward_step": 0.98828125, "step": 50 }, { "adv/mean_abs_final_conf": 0.7597240209579468, "adv/mean_abs_reasoning": 0.4087080955505371, "adv/mean_abs_step_conf": 0.7495004534721375, "adv/ratio_final_to_reasoning": 1.858842604853679, "adv/ratio_step_to_reasoning": 1.8338282545212297, "adv/std_final_conf": 0.9355056285858154, "adv/std_reasoning": 0.6816107630729675, "adv/std_step_conf": 0.9318075776100159, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.7265625, "calib/ece": 0.2642350393700787, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.0370575772849463, "calib/mean_conf": 0.525213779527559, "calib/mu_c": 0.5161682291666666, "calib/mu_w": 0.5532258064516129, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016771653543307095, "calib/std_conf": 0.1314685058805761, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4429823792123426, "calib/step_q_c_n": 2463.0, "calib/step_q_gap": -0.025647497095935223, "calib/step_q_w": 0.46862987630827785, "calib/step_q_w_n": 1051.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 870.94921875, "completions/mean_terminated_length": 877.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.0544, "grad_norm": 0.314399778842926, "kl": 0.101898193359375, "learning_rate": 4.166666666666667e-06, "loss": -0.0247, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01864321529865265, "mask/share_reasoning": 0.798714280128479, "mask/share_step_conf": 0.17483000457286835, "num_tokens": 16206769.0, "reward": 0.9772302508354187, "reward_std": 0.10499308258295059, "rewards/accuracy_reward_step": 0.75, "rewards/asymmetric_l2_reward": 0.8804265260696411, "rewards/final_brier_reward_step": 0.7255964279174805, "rewards/format_reward_step": 0.9921875, "step": 51 }, { "adv/mean_abs_final_conf": 0.7534183263778687, "adv/mean_abs_reasoning": 0.2664984166622162, "adv/mean_abs_step_conf": 0.7155110239982605, "adv/ratio_final_to_reasoning": 2.8271024489155523, "adv/ratio_step_to_reasoning": 2.684860319095865, "adv/std_final_conf": 0.9353107213973999, "adv/std_reasoning": 0.5726114511489868, "adv/std_step_conf": 0.9330041408538818, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.9140625, "calib/ece": 0.23878924302788848, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0199203187250996, "calib/gap": -0.01161825602968447, "calib/mean_conf": 0.5542784860557769, "calib/mu_c": 0.5517326530612245, "calib/mu_w": 0.563350909090909, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.006095617529880484, "calib/std_conf": 0.1473711197877955, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4724851987023519, "calib/step_q_c_n": 2466.0, "calib/step_q_gap": -0.016751753852392637, "calib/step_q_w": 0.48923695255474453, "calib/step_q_w_n": 1096.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 938.52734375, "completions/mean_terminated_length": 957.22314453125, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.055466666666666664, "grad_norm": 0.3702726364135742, "kl": 0.1003875732421875, "learning_rate": 4.138888888888889e-06, "loss": -0.1376, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017278529703617096, "mask/share_reasoning": 0.7905373573303223, "mask/share_step_conf": 0.17265285551548004, "num_tokens": 16554984.0, "reward": 0.9832615256309509, "reward_std": 0.10831600427627563, "rewards/accuracy_reward_step": 0.765625, "rewards/asymmetric_l2_reward": 0.8801380395889282, "rewards/final_brier_reward_step": 0.737166166305542, "rewards/format_reward_step": 0.98046875, "step": 52 }, { "adv/mean_abs_final_conf": 0.7536748647689819, "adv/mean_abs_reasoning": 0.3818695545196533, "adv/mean_abs_step_conf": 0.7461249232292175, "adv/ratio_final_to_reasoning": 1.9736448110324367, "adv/ratio_step_to_reasoning": 1.953873814757907, "adv/std_final_conf": 0.9351969361305237, "adv/std_reasoning": 0.6403175592422485, "adv/std_step_conf": 0.9318367838859558, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.28125, "calib/ece": 0.2515141176470588, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.02755208089587169, "calib/mean_conf": 0.5222113725490196, "calib/mu_c": 0.5155124352331606, "calib/mu_w": 0.5430645161290323, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.008431372549019607, "calib/std_conf": 0.11159081796894785, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4316606194690265, "calib/step_q_c_n": 2260.0, "calib/step_q_gap": -0.010703633924638578, "calib/step_q_w": 0.4423642533936651, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 925.9296875, "completions/mean_terminated_length": 929.5608520507812, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.05653333333333333, "grad_norm": 0.49424004554748535, "kl": 0.1109771728515625, "learning_rate": 4.111111111111111e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017200004309415817, "mask/share_reasoning": 0.8054446578025818, "mask/share_step_conf": 0.1734490692615509, "num_tokens": 16897846.0, "reward": 0.9913618564605713, "reward_std": 0.07794886827468872, "rewards/accuracy_reward_step": 0.75390625, "rewards/asymmetric_l2_reward": 0.8972834944725037, "rewards/final_brier_reward_step": 0.7354402542114258, "rewards/format_reward_step": 0.99609375, "step": 53 }, { "adv/mean_abs_final_conf": 0.742029070854187, "adv/mean_abs_reasoning": 0.249742791056633, "adv/mean_abs_step_conf": 0.7290786504745483, "adv/ratio_final_to_reasoning": 2.9711731326247595, "adv/ratio_step_to_reasoning": 2.9193181007944236, "adv/std_final_conf": 0.9349008202552795, "adv/std_reasoning": 0.548269510269165, "adv/std_step_conf": 0.9322406649589539, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.65625, "calib/ece": 0.37767716535433077, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004044392523364415, "calib/mean_conf": 0.4648425196850394, "calib/mu_c": 0.46420560747663553, "calib/mu_w": 0.46824999999999994, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0, "calib/std_conf": 0.08772594451861253, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3908158508158508, "calib/step_q_c_n": 2145.0, "calib/step_q_gap": -0.0027519021858644632, "calib/step_q_w": 0.39356775300171526, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2491.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 782.62890625, "completions/mean_terminated_length": 785.6980590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.0576, "grad_norm": 1.06956148147583, "kl": 0.124053955078125, "learning_rate": 4.083333333333334e-06, "loss": -0.009, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021347707137465477, "mask/share_reasoning": 0.8023233413696289, "mask/share_step_conf": 0.17242267727851868, "num_tokens": 17204431.0, "reward": 0.9782155752182007, "reward_std": 0.07600846886634827, "rewards/accuracy_reward_step": 0.8359375, "rewards/asymmetric_l2_reward": 0.8835257291793823, "rewards/final_brier_reward_step": 0.708061695098877, "rewards/format_reward_step": 0.98828125, "step": 54 }, { "adv/mean_abs_final_conf": 0.7585951089859009, "adv/mean_abs_reasoning": 0.38029834628105164, "adv/mean_abs_step_conf": 0.7479252815246582, "adv/ratio_final_to_reasoning": 1.9947368070469516, "adv/ratio_step_to_reasoning": 1.9666803414704292, "adv/std_final_conf": 0.9352501034736633, "adv/std_reasoning": 0.6614001989364624, "adv/std_step_conf": 0.9326197504997253, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 12.18359375, "calib/ece": 0.26363934426229507, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03438524531795201, "calib/mean_conf": 0.427344262295082, "calib/mu_c": 0.41423841059602645, "calib/mu_w": 0.44862365591397846, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.036065573770491806, "calib/std_conf": 0.08968851110624923, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34170257123002085, "calib/step_q_c_n": 1439.0, "calib/step_q_gap": -0.0464777859128363, "calib/step_q_w": 0.38818035714285715, "calib/step_q_w_n": 1680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 844.01171875, "completions/mean_terminated_length": 881.9060668945312, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.058666666666666666, "grad_norm": 0.28148743510246277, "kl": 0.1166229248046875, "learning_rate": 4.055555555555556e-06, "loss": -0.2742, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018118858337402344, "mask/share_reasoning": 0.7682849168777466, "mask/share_step_conf": 0.17062747478485107, "num_tokens": 17528322.0, "reward": 0.9282364845275879, "reward_std": 0.13289257884025574, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.8768744468688965, "rewards/final_brier_reward_step": 0.6702234745025635, "rewards/format_reward_step": 0.953125, "step": 55 }, { "adv/mean_abs_final_conf": 0.7894688844680786, "adv/mean_abs_reasoning": 0.46479547023773193, "adv/mean_abs_step_conf": 0.736449122428894, "adv/ratio_final_to_reasoning": 1.6985296437254087, "adv/ratio_step_to_reasoning": 1.5844584760092815, "adv/std_final_conf": 0.9351093769073486, "adv/std_reasoning": 0.7207304239273071, "adv/std_step_conf": 0.9316466450691223, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 10.07421875, "calib/ece": 0.28452000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0038453815261044744, "calib/mean_conf": 0.38827999999999996, "calib/mu_c": 0.38698795180722895, "calib/mu_w": 0.3908333333333334, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.004399999999999999, "calib/std_conf": 0.09335438714918545, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3113822284908322, "calib/step_q_c_n": 1418.0, "calib/step_q_gap": -0.019659976504861176, "calib/step_q_w": 0.33104220499569337, "calib/step_q_w_n": 1161.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 835.296875, "completions/mean_terminated_length": 855.3440551757812, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.05973333333333333, "grad_norm": 0.5706960558891296, "kl": 0.1063995361328125, "learning_rate": 4.027777777777779e-06, "loss": -0.1725, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018997922539711, "mask/share_reasoning": 0.7841348648071289, "mask/share_step_conf": 0.1734296977519989, "num_tokens": 17848998.0, "reward": 0.9477958679199219, "reward_std": 0.11178995668888092, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.9028053283691406, "rewards/final_brier_reward_step": 0.6693488359451294, "rewards/format_reward_step": 0.96875, "step": 56 }, { "adv/mean_abs_final_conf": 0.7547876834869385, "adv/mean_abs_reasoning": 0.3574499487876892, "adv/mean_abs_step_conf": 0.7497657537460327, "adv/ratio_final_to_reasoning": 2.1115898492833516, "adv/ratio_step_to_reasoning": 2.09754052641189, "adv/std_final_conf": 0.9353575706481934, "adv/std_reasoning": 0.6404086947441101, "adv/std_step_conf": 0.9325487017631531, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 8.1953125, "calib/ece": 0.4260080645161291, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.000868628858578635, "calib/mean_conf": 0.37641129032258064, "calib/mu_c": 0.3765829145728643, "calib/mu_w": 0.37571428571428567, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0, "calib/std_conf": 0.07295894914008418, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.30721355932203387, "calib/step_q_c_n": 1475.0, "calib/step_q_gap": -0.017328976793535955, "calib/step_q_w": 0.3245425361155698, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 801.04296875, "completions/mean_terminated_length": 823.5621948242188, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.0608, "grad_norm": 0.4064134657382965, "kl": 0.102325439453125, "learning_rate": 4.000000000000001e-06, "loss": -0.192, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018787814304232597, "mask/share_reasoning": 0.7746788263320923, "mask/share_step_conf": 0.17918963730335236, "num_tokens": 18160857.0, "reward": 0.9296246767044067, "reward_std": 0.1274452805519104, "rewards/accuracy_reward_step": 0.77734375, "rewards/asymmetric_l2_reward": 0.8837900161743164, "rewards/final_brier_reward_step": 0.6278031468391418, "rewards/format_reward_step": 0.9609375, "step": 57 }, { "adv/mean_abs_final_conf": 0.748571515083313, "adv/mean_abs_reasoning": 0.5739437937736511, "adv/mean_abs_step_conf": 0.763023853302002, "adv/ratio_final_to_reasoning": 1.3042592727791227, "adv/ratio_step_to_reasoning": 1.3294400280646979, "adv/std_final_conf": 0.9355748891830444, "adv/std_reasoning": 0.8264947533607483, "adv/std_step_conf": 0.9319159388542175, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 8.5234375, "calib/ece": 0.2677510040160642, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": 0.016079664570230523, "calib/mean_conf": 0.3791566265060241, "calib/mu_c": 0.38496855345911946, "calib/mu_w": 0.36888888888888893, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.004176706827309236, "calib/std_conf": 0.10478746689915559, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2958849557522124, "calib/step_q_c_n": 1130.0, "calib/step_q_gap": -0.028116945388472037, "calib/step_q_w": 0.32400190114068445, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 908.76953125, "completions/mean_terminated_length": 934.3172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.06186666666666667, "grad_norm": 0.3297543227672577, "kl": 0.1118927001953125, "learning_rate": 3.972222222222223e-06, "loss": -0.1075, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017441056668758392, "mask/share_reasoning": 0.7803479433059692, "mask/share_step_conf": 0.17486727237701416, "num_tokens": 18499822.0, "reward": 0.9434191584587097, "reward_std": 0.11306974291801453, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.8928941488265991, "rewards/final_brier_reward_step": 0.6759753823280334, "rewards/format_reward_step": 0.96875, "step": 58 }, { "adv/mean_abs_final_conf": 0.7699183225631714, "adv/mean_abs_reasoning": 0.3834386467933655, "adv/mean_abs_step_conf": 0.7528090476989746, "adv/ratio_final_to_reasoning": 2.0079309401957057, "adv/ratio_step_to_reasoning": 1.963310307906084, "adv/std_final_conf": 0.9350104928016663, "adv/std_reasoning": 0.6814318299293518, "adv/std_step_conf": 0.9274932742118835, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.5078125, "calib/ece": 0.34398437499999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.009840909090909067, "calib/mean_conf": 0.346328125, "calib/mu_c": 0.34940340909090906, "calib/mu_w": 0.3395625, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0014062500000000002, "calib/std_conf": 0.08510330205981655, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26401229148375766, "calib/step_q_c_n": 1139.0, "calib/step_q_gap": 0.0068334919946133055, "calib/step_q_w": 0.25717879948914435, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 825.96875, "completions/mean_terminated_length": 832.472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.06293333333333333, "grad_norm": 0.3263868987560272, "kl": 0.119873046875, "learning_rate": 3.944444444444445e-06, "loss": -0.008, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020793341100215912, "mask/share_reasoning": 0.7896591424942017, "mask/share_step_conf": 0.18173500895500183, "num_tokens": 18817518.0, "reward": 0.9638254046440125, "reward_std": 0.05267930030822754, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.924406886100769, "rewards/final_brier_reward_step": 0.6657439470291138, "rewards/format_reward_step": 1.0, "step": 59 }, { "adv/mean_abs_final_conf": 0.7652968168258667, "adv/mean_abs_reasoning": 0.49463218450546265, "adv/mean_abs_step_conf": 0.7708863019943237, "adv/ratio_final_to_reasoning": 1.5472038431769595, "adv/ratio_step_to_reasoning": 1.5585041292148472, "adv/std_final_conf": 0.935594916343689, "adv/std_reasoning": 0.7395151853561401, "adv/std_step_conf": 0.9312669038772583, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 8.2578125, "calib/ece": 0.3519102040816326, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.00548906738063365, "calib/mean_conf": 0.31943673469387757, "calib/mu_c": 0.3212962962962963, "calib/mu_w": 0.31580722891566265, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0050612244897959196, "calib/std_conf": 0.09404311363407125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2572097209720972, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.0019161018295248966, "calib/step_q_w": 0.2552936191425723, "calib/step_q_w_n": 1003.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 826.984375, "completions/mean_terminated_length": 853.6612548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.064, "grad_norm": 0.7665652632713318, "kl": 0.1191253662109375, "learning_rate": 3.916666666666667e-06, "loss": -0.1533, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019145751371979713, "mask/share_reasoning": 0.7707429528236389, "mask/share_step_conf": 0.17886127531528473, "num_tokens": 19138082.0, "reward": 0.9204329252243042, "reward_std": 0.13016876578330994, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.8981571197509766, "rewards/final_brier_reward_step": 0.624739944934845, "rewards/format_reward_step": 0.95703125, "step": 60 }, { "adv/mean_abs_final_conf": 0.7527413964271545, "adv/mean_abs_reasoning": 0.256498783826828, "adv/mean_abs_step_conf": 0.7378632426261902, "adv/ratio_final_to_reasoning": 2.9346782280860975, "adv/ratio_step_to_reasoning": 2.876673454811971, "adv/std_final_conf": 0.9355496168136597, "adv/std_reasoning": 0.5482407808303833, "adv/std_step_conf": 0.9278741478919983, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.4501372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.001661133603238829, "calib/mean_conf": 0.29496078431372547, "calib/mu_c": 0.29538421052631575, "calib/mu_w": 0.2937230769230769, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.09005518151129291, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2534646946564885, "calib/step_q_c_n": 1048.0, "calib/step_q_gap": 0.0170364708122063, "calib/step_q_w": 0.23642822384428222, "calib/step_q_w_n": 411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 746.8984375, "completions/mean_terminated_length": 752.779541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.06506666666666666, "grad_norm": 1.1591432094573975, "kl": 0.1569061279296875, "learning_rate": 3.88888888888889e-06, "loss": -0.0273, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022772040218114853, "mask/share_reasoning": 0.7806782126426697, "mask/share_step_conf": 0.18873721361160278, "num_tokens": 19433352.0, "reward": 0.9295242428779602, "reward_std": 0.07323986291885376, "rewards/accuracy_reward_step": 0.7421875, "rewards/asymmetric_l2_reward": 0.9137653112411499, "rewards/final_brier_reward_step": 0.5976269245147705, "rewards/format_reward_step": 0.99609375, "step": 61 }, { "adv/mean_abs_final_conf": 0.7567622661590576, "adv/mean_abs_reasoning": 0.4702766537666321, "adv/mean_abs_step_conf": 0.7801758050918579, "adv/ratio_final_to_reasoning": 1.609185274450366, "adv/ratio_step_to_reasoning": 1.6589720090145252, "adv/std_final_conf": 0.9355722069740295, "adv/std_reasoning": 0.7392281889915466, "adv/std_step_conf": 0.9290591478347778, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.87890625, "calib/ece": 0.4027138339920948, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.01961992753623193, "calib/mean_conf": 0.3245588932806325, "calib/mu_c": 0.3299097826086957, "calib/mu_w": 0.31028985507246376, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.10027020870091659, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25070945945945944, "calib/step_q_c_n": 814.0, "calib/step_q_gap": -0.002416977322149738, "calib/step_q_w": 0.2531264367816092, "calib/step_q_w_n": 435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2149.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 821.76953125, "completions/mean_terminated_length": 834.8135375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.06613333333333334, "grad_norm": 0.4029572606086731, "kl": 0.1348876953125, "learning_rate": 3.861111111111112e-06, "loss": -0.0466, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020339785143733025, "mask/share_reasoning": 0.7830666899681091, "mask/share_step_conf": 0.1809685379266739, "num_tokens": 19750805.0, "reward": 0.9473300576210022, "reward_std": 0.0737074613571167, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.9235177040100098, "rewards/final_brier_reward_step": 0.6297361850738525, "rewards/format_reward_step": 0.98828125, "step": 62 }, { "adv/mean_abs_final_conf": 0.7501711845397949, "adv/mean_abs_reasoning": 0.45388489961624146, "adv/mean_abs_step_conf": 0.7591933608055115, "adv/ratio_final_to_reasoning": 1.6527784580938092, "adv/ratio_step_to_reasoning": 1.6726561325292106, "adv/std_final_conf": 0.9358533024787903, "adv/std_reasoning": 0.7205963730812073, "adv/std_step_conf": 0.9303606748580933, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.3592694444444444, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0027265445579429115, "calib/mean_conf": 0.30398452380952384, "calib/mu_c": 0.30490419161676646, "calib/mu_w": 0.30217764705882355, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0002777777777777789, "calib/std_conf": 0.09132182212919027, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.23530800821355236, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.01567386187208894, "calib/step_q_w": 0.21963414634146342, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 901.65234375, "completions/mean_terminated_length": 912.3439331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.0672, "grad_norm": 0.46885189414024353, "kl": 0.12359619140625, "learning_rate": 3.833333333333334e-06, "loss": -0.0807, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01950598508119583, "mask/share_reasoning": 0.7906703352928162, "mask/share_step_conf": 0.1781049370765686, "num_tokens": 20090268.0, "reward": 0.9286515712738037, "reward_std": 0.104106605052948, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.9035576581954956, "rewards/final_brier_reward_step": 0.6271829605102539, "rewards/format_reward_step": 0.98046875, "step": 63 }, { "adv/mean_abs_final_conf": 0.7610681653022766, "adv/mean_abs_reasoning": 0.43136951327323914, "adv/mean_abs_step_conf": 0.7791768312454224, "adv/ratio_final_to_reasoning": 1.7643067993546382, "adv/ratio_step_to_reasoning": 1.8062862749224335, "adv/std_final_conf": 0.9357919096946716, "adv/std_reasoning": 0.7204633951187134, "adv/std_step_conf": 0.9319607019424438, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.42578125, "calib/ece": 0.4216220472440945, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01048192194109665, "calib/mean_conf": 0.32341732283464564, "calib/mu_c": 0.3206524064171123, "calib/mu_w": 0.33113432835820894, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004409448818897638, "calib/std_conf": 0.09597604203550639, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2508863298662704, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": -0.0380524088724683, "calib/step_q_w": 0.2889387387387387, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 778.67578125, "completions/mean_terminated_length": 784.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.06826666666666667, "grad_norm": 0.3983413279056549, "kl": 0.1300201416015625, "learning_rate": 3.8055555555555556e-06, "loss": -0.0801, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021179143339395523, "mask/share_reasoning": 0.7854925394058228, "mask/share_step_conf": 0.18551582098007202, "num_tokens": 20393385.0, "reward": 0.932783842086792, "reward_std": 0.08730573207139969, "rewards/accuracy_reward_step": 0.73046875, "rewards/asymmetric_l2_reward": 0.9037855863571167, "rewards/final_brier_reward_step": 0.6172508597373962, "rewards/format_reward_step": 0.9921875, "step": 64 }, { "adv/mean_abs_final_conf": 0.7611038684844971, "adv/mean_abs_reasoning": 0.2436269223690033, "adv/mean_abs_step_conf": 0.773918628692627, "adv/ratio_final_to_reasoning": 3.12405485027517, "adv/ratio_step_to_reasoning": 3.176654784976641, "adv/std_final_conf": 0.9340925216674805, "adv/std_reasoning": 0.5227276682853699, "adv/std_step_conf": 0.9306238889694214, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.40625, "calib/ece": 0.30270980392156865, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.005628198186002464, "calib/mean_conf": 0.34827058823529417, "calib/mu_c": 0.35023493975903613, "calib/mu_w": 0.34460674157303367, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.08010675057627338, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2664497767857143, "calib/step_q_c_n": 1344.0, "calib/step_q_gap": -0.012089827174681766, "calib/step_q_w": 0.27853960396039606, "calib/step_q_w_n": 808.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 706.8359375, "completions/mean_terminated_length": 712.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.06933333333333333, "grad_norm": 0.33293867111206055, "kl": 0.1207275390625, "learning_rate": 3.777777777777778e-06, "loss": -0.0213, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023217733949422836, "mask/share_reasoning": 0.7761552333831787, "mask/share_step_conf": 0.19281449913978577, "num_tokens": 20679359.0, "reward": 0.9603981375694275, "reward_std": 0.056373294442892075, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.9172334671020508, "rewards/final_brier_reward_step": 0.6746565103530884, "rewards/format_reward_step": 0.99609375, "step": 65 }, { "adv/mean_abs_final_conf": 0.7603933811187744, "adv/mean_abs_reasoning": 0.3486219048500061, "adv/mean_abs_step_conf": 0.7570275068283081, "adv/ratio_final_to_reasoning": 2.1811405724661284, "adv/ratio_step_to_reasoning": 2.1714857738328797, "adv/std_final_conf": 0.9345870018005371, "adv/std_reasoning": 0.618615984916687, "adv/std_step_conf": 0.9317054152488708, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 12.4296875, "calib/ece": 0.1384, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.017334869431643696, "calib/mean_conf": 0.36335999999999996, "calib/mu_c": 0.3720967741935484, "calib/mu_w": 0.3547619047619047, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0028799999999999997, "calib/std_conf": 0.09241596398891264, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2951803751803752, "calib/step_q_c_n": 1386.0, "calib/step_q_gap": 0.009495519946522257, "calib/step_q_w": 0.28568485523385295, "calib/step_q_w_n": 1796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 865.55078125, "completions/mean_terminated_length": 886.3240356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.0704, "grad_norm": 0.215412899851799, "kl": 0.097991943359375, "learning_rate": 3.7500000000000005e-06, "loss": -0.1417, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01876765489578247, "mask/share_reasoning": 0.7747286558151245, "mask/share_step_conf": 0.1830662488937378, "num_tokens": 21007292.0, "reward": 0.9492697715759277, "reward_std": 0.08717604726552963, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.8946832418441772, "rewards/final_brier_reward_step": 0.7124500274658203, "rewards/format_reward_step": 0.97265625, "step": 66 }, { "adv/mean_abs_final_conf": 0.7286356687545776, "adv/mean_abs_reasoning": 0.3127225935459137, "adv/mean_abs_step_conf": 0.7489851713180542, "adv/ratio_final_to_reasoning": 2.3299745006994512, "adv/ratio_step_to_reasoning": 2.3950465581185734, "adv/std_final_conf": 0.9348448514938354, "adv/std_reasoning": 0.618545413017273, "adv/std_step_conf": 0.9319062829017639, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.15625, "calib/ece": 0.3642857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.026106893410264176, "calib/mean_conf": 0.35912698412698413, "calib/mu_c": 0.35146067415730337, "calib/mu_w": 0.37756756756756754, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.008531746031746031, "calib/std_conf": 0.08946951100146361, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2827706057596822, "calib/step_q_c_n": 2014.0, "calib/step_q_gap": -0.03709278221845985, "calib/step_q_w": 0.31986338797814207, "calib/step_q_w_n": 1098.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2205.0, "completions/max_terminated_length": 2205.0, "completions/mean_length": 831.875, "completions/mean_terminated_length": 845.0794067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.07146666666666666, "grad_norm": 0.20705059170722961, "kl": 0.1094512939453125, "learning_rate": 3.7222222222222225e-06, "loss": -0.1569, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0199112705886364, "mask/share_reasoning": 0.7821078300476074, "mask/share_step_conf": 0.1823558807373047, "num_tokens": 21325260.0, "reward": 0.9343211054801941, "reward_std": 0.08339205384254456, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8897289633750916, "rewards/final_brier_reward_step": 0.6429757475852966, "rewards/format_reward_step": 0.984375, "step": 67 }, { "adv/mean_abs_final_conf": 0.7309602499008179, "adv/mean_abs_reasoning": 0.38961371779441833, "adv/mean_abs_step_conf": 0.7417314648628235, "adv/ratio_final_to_reasoning": 1.8761152816660136, "adv/ratio_step_to_reasoning": 1.9037611639079963, "adv/std_final_conf": 0.9348047971725464, "adv/std_reasoning": 0.6815693974494934, "adv/std_step_conf": 0.9303959012031555, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.24609375, "calib/ece": 0.25959999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02750268528464017, "calib/mean_conf": 0.36736, "calib/mu_c": 0.35657894736842105, "calib/mu_w": 0.3840816326530612, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.009479999999999999, "calib/std_conf": 0.07379044924649801, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.284954233409611, "calib/step_q_c_n": 1748.0, "calib/step_q_gap": -0.039020203595866776, "calib/step_q_w": 0.3239744370054778, "calib/step_q_w_n": 1643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 786.79296875, "completions/mean_terminated_length": 802.4661865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.07253333333333334, "grad_norm": 0.17438991367816925, "kl": 0.1012115478515625, "learning_rate": 3.694444444444445e-06, "loss": -0.0948, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02090483158826828, "mask/share_reasoning": 0.760016918182373, "mask/share_step_conf": 0.1995469331741333, "num_tokens": 21630767.0, "reward": 0.9380624294281006, "reward_std": 0.09444180876016617, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.8953513503074646, "rewards/final_brier_reward_step": 0.667492151260376, "rewards/format_reward_step": 0.97265625, "step": 68 }, { "adv/mean_abs_final_conf": 0.7689613103866577, "adv/mean_abs_reasoning": 0.5451756119728088, "adv/mean_abs_step_conf": 0.7415592074394226, "adv/ratio_final_to_reasoning": 1.4104836927756967, "adv/ratio_step_to_reasoning": 1.3602208006993692, "adv/std_final_conf": 0.9351477026939392, "adv/std_reasoning": 0.7753952145576477, "adv/std_step_conf": 0.929383397102356, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.21875, "calib/ece": 0.24432539682539678, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02189115646258505, "calib/mean_conf": 0.38551587301587303, "calib/mu_c": 0.37639455782312925, "calib/mu_w": 0.3982857142857143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023253968253968246, "calib/std_conf": 0.08432817922580464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2999098243948743, "calib/step_q_c_n": 2107.0, "calib/step_q_gap": -0.032326061574941334, "calib/step_q_w": 0.3322358859698156, "calib/step_q_w_n": 1789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 876.90625, "completions/mean_terminated_length": 890.825439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.0736, "grad_norm": 0.2321401983499527, "kl": 0.0926361083984375, "learning_rate": 3.6666666666666666e-06, "loss": -0.1673, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018184015527367592, "mask/share_reasoning": 0.7876362204551697, "mask/share_step_conf": 0.17855477333068848, "num_tokens": 21959751.0, "reward": 0.9506509304046631, "reward_std": 0.08560098707675934, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.9004615545272827, "rewards/final_brier_reward_step": 0.6891214847564697, "rewards/format_reward_step": 0.984375, "step": 69 }, { "adv/mean_abs_final_conf": 0.7773220539093018, "adv/mean_abs_reasoning": 0.31656327843666077, "adv/mean_abs_step_conf": 0.7578777074813843, "adv/ratio_final_to_reasoning": 2.455502917925559, "adv/ratio_step_to_reasoning": 2.3940796646539138, "adv/std_final_conf": 0.9348402619361877, "adv/std_reasoning": 0.6185460090637207, "adv/std_step_conf": 0.9301038384437561, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.96875, "calib/ece": 0.30260000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.023262032085561324, "calib/mean_conf": 0.397, "calib/mu_c": 0.3890909090909092, "calib/mu_w": 0.41235294117647053, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01980000000000001, "calib/std_conf": 0.0848068393468357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3166220437304774, "calib/step_q_c_n": 2241.0, "calib/step_q_gap": -0.006734335904972022, "calib/step_q_w": 0.32335637963544944, "calib/step_q_w_n": 1591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 873.046875, "completions/mean_terminated_length": 890.4382934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 0.07466666666666667, "grad_norm": 0.18580324947834015, "kl": 0.0870819091796875, "learning_rate": 3.638888888888889e-06, "loss": -0.1155, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01974579505622387, "mask/share_reasoning": 0.7681633234024048, "mask/share_step_conf": 0.1925596296787262, "num_tokens": 22290243.0, "reward": 0.9446487426757812, "reward_std": 0.08830288797616959, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8924236297607422, "rewards/final_brier_reward_step": 0.6726551055908203, "rewards/format_reward_step": 0.9765625, "step": 70 }, { "adv/mean_abs_final_conf": 0.7830674052238464, "adv/mean_abs_reasoning": 0.5309762954711914, "adv/mean_abs_step_conf": 0.7585134506225586, "adv/ratio_final_to_reasoning": 1.4747690469476569, "adv/ratio_step_to_reasoning": 1.428526013481354, "adv/std_final_conf": 0.9354208707809448, "adv/std_reasoning": 0.7754674553871155, "adv/std_step_conf": 0.9312028288841248, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.078125, "calib/ece": 0.28405622489959836, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.018561089825457677, "calib/mean_conf": 0.41401606425702814, "calib/mu_c": 0.40753086419753093, "calib/mu_w": 0.4260919540229886, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023734939759036143, "calib/std_conf": 0.09730472022096878, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34493478260869564, "calib/step_q_c_n": 2300.0, "calib/step_q_gap": -0.016481884057971075, "calib/step_q_w": 0.3614166666666667, "calib/step_q_w_n": 1560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 848.45703125, "completions/mean_terminated_length": 865.3585815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.07573333333333333, "grad_norm": 71.88875579833984, "kl": 0.3347930908203125, "learning_rate": 3.6111111111111115e-06, "loss": -0.1094, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020054321736097336, "mask/share_reasoning": 0.7716023325920105, "mask/share_step_conf": 0.18881210684776306, "num_tokens": 22611856.0, "reward": 0.9372549057006836, "reward_std": 0.11026224493980408, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.8770648241043091, "rewards/final_brier_reward_step": 0.6763511896133423, "rewards/format_reward_step": 0.96875, "step": 71 }, { "adv/mean_abs_final_conf": 0.776190996170044, "adv/mean_abs_reasoning": 0.27835196256637573, "adv/mean_abs_step_conf": 0.7500138282775879, "adv/ratio_final_to_reasoning": 2.7885235261632246, "adv/ratio_step_to_reasoning": 2.694480115615279, "adv/std_final_conf": 0.9342138767242432, "adv/std_reasoning": 0.5483442544937134, "adv/std_step_conf": 0.9271004796028137, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.5625, "calib/ece": 0.19615686274509814, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0030857142857141917, "calib/mean_conf": 0.4248627450980392, "calib/mu_c": 0.42613333333333325, "calib/mu_w": 0.42304761904761906, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016392156862745096, "calib/std_conf": 0.07119071860849452, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3473727581192439, "calib/step_q_c_n": 2063.0, "calib/step_q_gap": -0.006837452090966312, "calib/step_q_w": 0.3542102102102102, "calib/step_q_w_n": 1665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 737.3671875, "completions/mean_terminated_length": 743.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.0768, "grad_norm": 0.280191570520401, "kl": 0.0969085693359375, "learning_rate": 3.5833333333333335e-06, "loss": -0.0198, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02061384916305542, "mask/share_reasoning": 0.7627760171890259, "mask/share_step_conf": 0.20879769325256348, "num_tokens": 22905030.0, "reward": 0.9801347255706787, "reward_std": 0.046480339020490646, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.9191835522651672, "rewards/final_brier_reward_step": 0.724679708480835, "rewards/format_reward_step": 0.99609375, "step": 72 }, { "adv/mean_abs_final_conf": 0.7884075045585632, "adv/mean_abs_reasoning": 0.3293514549732208, "adv/mean_abs_step_conf": 0.7535132169723511, "adv/ratio_final_to_reasoning": 2.393818192248969, "adv/ratio_step_to_reasoning": 2.287869707554862, "adv/std_final_conf": 0.9344567060470581, "adv/std_reasoning": 0.5961430668830872, "adv/std_step_conf": 0.9287940263748169, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.10546875, "calib/ece": 0.4356126482213439, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.004820524620340605, "calib/mean_conf": 0.40233201581027667, "calib/mu_c": 0.40311320754716984, "calib/mu_w": 0.39829268292682923, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.06555099727511851, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32879655172413796, "calib/step_q_c_n": 2900.0, "calib/step_q_gap": -0.0456338280226975, "calib/step_q_w": 0.37443037974683546, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 751.0703125, "completions/mean_terminated_length": 759.976318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.07786666666666667, "grad_norm": 0.2157929539680481, "kl": 0.0982208251953125, "learning_rate": 3.555555555555556e-06, "loss": -0.0738, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02034447342157364, "mask/share_reasoning": 0.7735206484794617, "mask/share_step_conf": 0.19441615045070648, "num_tokens": 23204336.0, "reward": 0.9625263810157776, "reward_std": 0.06073135510087013, "rewards/accuracy_reward_step": 0.828125, "rewards/asymmetric_l2_reward": 0.8981797099113464, "rewards/final_brier_reward_step": 0.6635918617248535, "rewards/format_reward_step": 0.98828125, "step": 73 }, { "adv/mean_abs_final_conf": 0.7441131472587585, "adv/mean_abs_reasoning": 0.42664575576782227, "adv/mean_abs_step_conf": 0.7311651110649109, "adv/ratio_final_to_reasoning": 1.7441006671204293, "adv/ratio_step_to_reasoning": 1.7137522199161546, "adv/std_final_conf": 0.9347469210624695, "adv/std_reasoning": 0.7204641699790955, "adv/std_step_conf": 0.9290632009506226, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.78515625, "calib/ece": 0.2318253968253967, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.04628135792011473, "calib/mean_conf": 0.4142857142857143, "calib/mu_c": 0.3953691275167785, "calib/mu_w": 0.4416504854368932, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027420634920634936, "calib/std_conf": 0.08214320220769483, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32161306284017815, "calib/step_q_c_n": 2021.0, "calib/step_q_gap": -0.04110475894200011, "calib/step_q_w": 0.36271782178217826, "calib/step_q_w_n": 2020.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 837.6328125, "completions/mean_terminated_length": 847.5652465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.07893333333333333, "grad_norm": 0.20430006086826324, "kl": 0.097412109375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0947, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020939674228429794, "mask/share_reasoning": 0.7661980986595154, "mask/share_step_conf": 0.20114345848560333, "num_tokens": 23522698.0, "reward": 0.9546435475349426, "reward_std": 0.07927078753709793, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.9090206623077393, "rewards/final_brier_reward_step": 0.6869851350784302, "rewards/format_reward_step": 0.984375, "step": 74 }, { "adv/mean_abs_final_conf": 0.7717742919921875, "adv/mean_abs_reasoning": 0.302400678396225, "adv/mean_abs_step_conf": 0.749823808670044, "adv/ratio_final_to_reasoning": 2.552157938551179, "adv/ratio_step_to_reasoning": 2.4795705242683885, "adv/std_final_conf": 0.9339922070503235, "adv/std_reasoning": 0.595943033695221, "adv/std_step_conf": 0.9266640543937683, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 14.125, "calib/ece": 0.42546875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.017243589743589638, "calib/mean_conf": 0.39265625, "calib/mu_c": 0.3894230769230769, "calib/mu_w": 0.40666666666666657, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0028125, "calib/std_conf": 0.07465529509644643, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3184916990462734, "calib/step_q_c_n": 2831.0, "calib/step_q_gap": -0.015164351909140616, "calib/step_q_w": 0.333656050955414, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 751.953125, "completions/mean_terminated_length": 757.8740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.08, "grad_norm": 0.31411486864089966, "kl": 0.1016693115234375, "learning_rate": 3.5e-06, "loss": 0.0006, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02108791470527649, "mask/share_reasoning": 0.7715064883232117, "mask/share_step_conf": 0.19959309697151184, "num_tokens": 23819950.0, "reward": 0.9618932008743286, "reward_std": 0.03922698646783829, "rewards/accuracy_reward_step": 0.8125, "rewards/asymmetric_l2_reward": 0.900726318359375, "rewards/final_brier_reward_step": 0.660560131072998, "rewards/format_reward_step": 1.0, "step": 75 }, { "adv/mean_abs_final_conf": 0.7741357088088989, "adv/mean_abs_reasoning": 0.3422073721885681, "adv/mean_abs_step_conf": 0.7473089694976807, "adv/ratio_final_to_reasoning": 2.262183026209977, "adv/ratio_step_to_reasoning": 2.1837898018336306, "adv/std_final_conf": 0.9345347285270691, "adv/std_reasoning": 0.6186551451683044, "adv/std_step_conf": 0.9287329316139221, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.76953125, "calib/ece": 0.23940711462450598, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0012647300630310165, "calib/mean_conf": 0.41007905138339923, "calib/mu_c": 0.4096341463414634, "calib/mu_w": 0.4108988764044944, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0006324110671936743, "calib/std_conf": 0.07286889030171712, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32993052540164997, "calib/step_q_c_n": 2303.0, "calib/step_q_gap": -0.009900327101732997, "calib/step_q_w": 0.33983085250338296, "calib/step_q_w_n": 1478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 804.29296875, "completions/mean_terminated_length": 813.830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.08106666666666666, "grad_norm": 0.18030913174152374, "kl": 0.0911865234375, "learning_rate": 3.4722222222222224e-06, "loss": -0.0913, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020517395809292793, "mask/share_reasoning": 0.7699077129364014, "mask/share_step_conf": 0.1978561282157898, "num_tokens": 24128905.0, "reward": 0.9599854946136475, "reward_std": 0.07851266860961914, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.8973936438560486, "rewards/final_brier_reward_step": 0.6975773572921753, "rewards/format_reward_step": 0.984375, "step": 76 }, { "adv/mean_abs_final_conf": 0.7624342441558838, "adv/mean_abs_reasoning": 0.3686603307723999, "adv/mean_abs_step_conf": 0.7488216161727905, "adv/ratio_final_to_reasoning": 2.0681211958945167, "adv/ratio_step_to_reasoning": 2.0311966156051953, "adv/std_final_conf": 0.9349859952926636, "adv/std_reasoning": 0.6404635310173035, "adv/std_step_conf": 0.9294195771217346, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.3203125, "calib/ece": 0.3990763052208835, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.012093877551020427, "calib/mean_conf": 0.41497991967871484, "calib/mu_c": 0.41259999999999997, "calib/mu_w": 0.4246938775510204, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.005421686746987952, "calib/std_conf": 0.08229208679203565, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34, "calib/step_q_c_n": 2810.0, "calib/step_q_gap": -0.058444244604316575, "calib/step_q_w": 0.3984442446043166, "calib/step_q_w_n": 1112.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 783.70703125, "completions/mean_terminated_length": 805.7389526367188, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.08213333333333334, "grad_norm": 0.23246468603610992, "kl": 0.0965423583984375, "learning_rate": 3.444444444444445e-06, "loss": -0.1831, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021100902929902077, "mask/share_reasoning": 0.7538150548934937, "mask/share_step_conf": 0.1977403163909912, "num_tokens": 24434198.0, "reward": 0.9426138401031494, "reward_std": 0.09969314932823181, "rewards/accuracy_reward_step": 0.78125, "rewards/asymmetric_l2_reward": 0.8764042258262634, "rewards/final_brier_reward_step": 0.6588234305381775, "rewards/format_reward_step": 0.96875, "step": 77 }, { "adv/mean_abs_final_conf": 0.783347487449646, "adv/mean_abs_reasoning": 0.33605891466140747, "adv/mean_abs_step_conf": 0.7540993690490723, "adv/ratio_final_to_reasoning": 2.3309826142802947, "adv/ratio_step_to_reasoning": 2.2439499032747188, "adv/std_final_conf": 0.9342900514602661, "adv/std_reasoning": 0.618588387966156, "adv/std_step_conf": 0.9270033836364746, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 15.078125, "calib/ece": 0.26610236220472444, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.000626365109560878, "calib/mean_conf": 0.415, "calib/mu_c": 0.4152046783625731, "calib/mu_w": 0.4145783132530122, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003937007874015748, "calib/std_conf": 0.0683365340777814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3311902834008097, "calib/step_q_c_n": 2470.0, "calib/step_q_gap": -0.006680220196312592, "calib/step_q_w": 0.3378705035971223, "calib/step_q_w_n": 1390.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 870.84765625, "completions/mean_terminated_length": 877.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.0832, "grad_norm": 0.2971472442150116, "kl": 0.08612060546875, "learning_rate": 3.416666666666667e-06, "loss": -0.0654, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01775174029171467, "mask/share_reasoning": 0.7862757444381714, "mask/share_step_conf": 0.1881600171327591, "num_tokens": 24765159.0, "reward": 0.9712415337562561, "reward_std": 0.055430181324481964, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.9070584774017334, "rewards/final_brier_reward_step": 0.7033933401107788, "rewards/format_reward_step": 0.9921875, "step": 78 }, { "adv/mean_abs_final_conf": 0.7488627433776855, "adv/mean_abs_reasoning": 0.45631909370422363, "adv/mean_abs_step_conf": 0.728693962097168, "adv/ratio_final_to_reasoning": 1.6410944746990634, "adv/ratio_step_to_reasoning": 1.5968956200844227, "adv/std_final_conf": 0.9342256784439087, "adv/std_reasoning": 0.7207400798797607, "adv/std_step_conf": 0.928988516330719, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.7109375, "calib/ece": 0.2813147410358565, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.035773249738767054, "calib/mean_conf": 0.41207171314741037, "calib/mu_c": 0.4230459770114943, "calib/mu_w": 0.38727272727272727, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 7.968127490039841e-05, "calib/std_conf": 0.08611866891742435, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3313593037712391, "calib/step_q_c_n": 2413.0, "calib/step_q_gap": -0.0338365572782805, "calib/step_q_w": 0.3651958610495196, "calib/step_q_w_n": 1353.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 882.296875, "completions/mean_terminated_length": 899.87255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.08426666666666667, "grad_norm": 0.2342347800731659, "kl": 0.085540771484375, "learning_rate": 3.3888888888888893e-06, "loss": -0.1641, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017933469265699387, "mask/share_reasoning": 0.7754608988761902, "mask/share_step_conf": 0.18707439303398132, "num_tokens": 25097403.0, "reward": 0.9691455364227295, "reward_std": 0.08597791194915771, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.9041585922241211, "rewards/final_brier_reward_step": 0.7021011710166931, "rewards/format_reward_step": 0.98046875, "step": 79 }, { "adv/mean_abs_final_conf": 0.7630899548530579, "adv/mean_abs_reasoning": 0.4424920678138733, "adv/mean_abs_step_conf": 0.7433421611785889, "adv/ratio_final_to_reasoning": 1.7245279867345296, "adv/ratio_step_to_reasoning": 1.679899404414325, "adv/std_final_conf": 0.934352457523346, "adv/std_reasoning": 0.7204890251159668, "adv/std_step_conf": 0.9272314310073853, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.57421875, "calib/ece": 0.34839215686274516, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0014839572192513195, "calib/mean_conf": 0.3943529411764705, "calib/mu_c": 0.3939572192513369, "calib/mu_w": 0.39544117647058824, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004705882352941178, "calib/std_conf": 0.071352981588337, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.329205298013245, "calib/step_q_c_n": 2718.0, "calib/step_q_gap": 0.0012191183488817203, "calib/step_q_w": 0.3279861796643633, "calib/step_q_w_n": 1013.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 754.65625, "completions/mean_terminated_length": 760.5984497070312, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.08533333333333333, "grad_norm": 0.320715993642807, "kl": 0.0978546142578125, "learning_rate": 3.3611111111111117e-06, "loss": -0.0259, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02104521170258522, "mask/share_reasoning": 0.7671687602996826, "mask/share_step_conf": 0.20397347211837769, "num_tokens": 25392755.0, "reward": 0.968206524848938, "reward_std": 0.048565518110990524, "rewards/accuracy_reward_step": 0.73046875, "rewards/asymmetric_l2_reward": 0.9099067449569702, "rewards/final_brier_reward_step": 0.6811937689781189, "rewards/format_reward_step": 0.99609375, "step": 80 }, { "adv/mean_abs_final_conf": 0.7350978851318359, "adv/mean_abs_reasoning": 0.3190707564353943, "adv/mean_abs_step_conf": 0.7276298999786377, "adv/ratio_final_to_reasoning": 2.3038710703049943, "adv/ratio_step_to_reasoning": 2.280465650025714, "adv/std_final_conf": 0.9339547157287598, "adv/std_reasoning": 0.618726909160614, "adv/std_step_conf": 0.9278249144554138, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.43359375, "calib/ece": 0.40763265306122454, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03755291005291006, "calib/mean_conf": 0.3913877551020409, "calib/mu_c": 0.38280423280423276, "calib/mu_w": 0.4203571428571428, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013795918367346938, "calib/std_conf": 0.06589193884812099, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3046979607541362, "calib/step_q_c_n": 2599.0, "calib/step_q_gap": -0.07689967238195855, "calib/step_q_w": 0.38159763313609474, "calib/step_q_w_n": 1352.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 774.77734375, "completions/mean_terminated_length": 806.2723388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.0864, "grad_norm": 0.2366163432598114, "kl": 0.0977935791015625, "learning_rate": 3.3333333333333333e-06, "loss": -0.2295, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0193870197981596, "mask/share_reasoning": 0.7494618892669678, "mask/share_step_conf": 0.1920885592699051, "num_tokens": 25697346.0, "reward": 0.9178603887557983, "reward_std": 0.11977177113294601, "rewards/accuracy_reward_step": 0.73828125, "rewards/asymmetric_l2_reward": 0.8634313941001892, "rewards/final_brier_reward_step": 0.6332269906997681, "rewards/format_reward_step": 0.95703125, "step": 81 }, { "adv/mean_abs_final_conf": 0.7666101455688477, "adv/mean_abs_reasoning": 0.3856019675731659, "adv/mean_abs_step_conf": 0.7624244093894958, "adv/ratio_final_to_reasoning": 1.9880867060757088, "adv/ratio_step_to_reasoning": 1.9772316365186335, "adv/std_final_conf": 0.9344494938850403, "adv/std_reasoning": 0.661274790763855, "adv/std_step_conf": 0.9273675680160522, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.78125, "calib/ece": 0.31511904761904763, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.015572674418604715, "calib/mean_conf": 0.3892460317460318, "calib/mu_c": 0.3843023255813953, "calib/mu_w": 0.39987500000000004, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010912698412698414, "calib/std_conf": 0.06362578923425258, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3134383088869715, "calib/step_q_c_n": 2318.0, "calib/step_q_gap": -0.018812714305388667, "calib/step_q_w": 0.33225102319236016, "calib/step_q_w_n": 1466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 769.0703125, "completions/mean_terminated_length": 781.27783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.08746666666666666, "grad_norm": 0.684307873249054, "kl": 0.0955047607421875, "learning_rate": 3.3055555555555558e-06, "loss": -0.1431, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020459000021219254, "mask/share_reasoning": 0.7678853273391724, "mask/share_step_conf": 0.19603067636489868, "num_tokens": 25999780.0, "reward": 0.9487217664718628, "reward_std": 0.07312074303627014, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8904172778129578, "rewards/final_brier_reward_step": 0.6757761836051941, "rewards/format_reward_step": 0.984375, "step": 82 }, { "adv/mean_abs_final_conf": 0.7622870206832886, "adv/mean_abs_reasoning": 0.3366669714450836, "adv/mean_abs_step_conf": 0.7469022274017334, "adv/ratio_final_to_reasoning": 2.2642168235609983, "adv/ratio_step_to_reasoning": 2.218519459143222, "adv/std_final_conf": 0.9338651299476624, "adv/std_reasoning": 0.6402016878128052, "adv/std_step_conf": 0.9288440942764282, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.16796875, "calib/ece": 0.2678884462151395, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.04030725482036346, "calib/mean_conf": 0.4127490039840638, "calib/mu_c": 0.39845679012345675, "calib/mu_w": 0.4387640449438202, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.017609561752988043, "calib/std_conf": 0.0699801675733177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3180939947780679, "calib/step_q_c_n": 2298.0, "calib/step_q_gap": -0.02536341847114354, "calib/step_q_w": 0.3434574132492114, "calib/step_q_w_n": 1585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 905.15625, "completions/mean_terminated_length": 923.187255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.08853333333333334, "grad_norm": 0.7267497777938843, "kl": 0.08420562744140625, "learning_rate": 3.277777777777778e-06, "loss": -0.1069, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017786450684070587, "mask/share_reasoning": 0.7786890864372253, "mask/share_step_conf": 0.18399320542812347, "num_tokens": 26338764.0, "reward": 0.9465469121932983, "reward_std": 0.07516686618328094, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.8903204202651978, "rewards/final_brier_reward_step": 0.6801171898841858, "rewards/format_reward_step": 0.98046875, "step": 83 }, { "adv/mean_abs_final_conf": 0.7693314552307129, "adv/mean_abs_reasoning": 0.27710047364234924, "adv/mean_abs_step_conf": 0.7469145059585571, "adv/ratio_final_to_reasoning": 2.7763628301252243, "adv/ratio_step_to_reasoning": 2.6954645589043347, "adv/std_final_conf": 0.9339439272880554, "adv/std_reasoning": 0.5726441144943237, "adv/std_step_conf": 0.9281235337257385, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.97265625, "calib/ece": 0.2960629921259842, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02080586080586072, "calib/mean_conf": 0.42842519685039365, "calib/mu_c": 0.4225274725274726, "calib/mu_w": 0.4433333333333333, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003976377952755906, "calib/std_conf": 0.06965522739681071, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3360793357933579, "calib/step_q_c_n": 2168.0, "calib/step_q_gap": -0.007650065767786929, "calib/step_q_w": 0.34372940156114484, "calib/step_q_w_n": 1153.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 757.12109375, "completions/mean_terminated_length": 763.0827026367188, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.0896, "grad_norm": 1.2036303281784058, "kl": 0.13165283203125, "learning_rate": 3.2500000000000002e-06, "loss": -0.044, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02188435010612011, "mask/share_reasoning": 0.775606632232666, "mask/share_step_conf": 0.1946965456008911, "num_tokens": 26638507.0, "reward": 0.9739149808883667, "reward_std": 0.052599694579839706, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.9121018052101135, "rewards/final_brier_reward_step": 0.695103108882904, "rewards/format_reward_step": 0.9921875, "step": 84 }, { "adv/mean_abs_final_conf": 0.7556276321411133, "adv/mean_abs_reasoning": 0.33100903034210205, "adv/mean_abs_step_conf": 0.7411812543869019, "adv/ratio_final_to_reasoning": 2.2828006576139703, "adv/ratio_step_to_reasoning": 2.2391572025116098, "adv/std_final_conf": 0.9345905184745789, "adv/std_reasoning": 0.6186448931694031, "adv/std_step_conf": 0.929984450340271, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.453125, "calib/ece": 0.23274900398406376, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.028314285714285736, "calib/mean_conf": 0.48525896414342634, "calib/mu_c": 0.47668571428571427, "calib/mu_w": 0.505, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01039840637450199, "calib/std_conf": 0.08286835350886146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3672099087353325, "calib/step_q_c_n": 2301.0, "calib/step_q_gap": -0.02267572385937805, "calib/step_q_w": 0.38988563259471054, "calib/step_q_w_n": 1399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 814.92578125, "completions/mean_terminated_length": 831.1593627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.09066666666666667, "grad_norm": 4.2196760177612305, "kl": 0.1661224365234375, "learning_rate": 3.2222222222222227e-06, "loss": -0.0779, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019442353397607803, "mask/share_reasoning": 0.7699412107467651, "mask/share_step_conf": 0.19108512997627258, "num_tokens": 26954952.0, "reward": 0.9674626588821411, "reward_std": 0.0934857726097107, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.8911292552947998, "rewards/final_brier_reward_step": 0.7109836339950562, "rewards/format_reward_step": 0.98046875, "step": 85 }, { "adv/mean_abs_final_conf": 0.7606679797172546, "adv/mean_abs_reasoning": 0.3904555141925812, "adv/mean_abs_step_conf": 0.7498719692230225, "adv/ratio_final_to_reasoning": 1.9481553008419714, "adv/ratio_step_to_reasoning": 1.9205055171872134, "adv/std_final_conf": 0.9347627758979797, "adv/std_reasoning": 0.6613179445266724, "adv/std_step_conf": 0.9294748306274414, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 14.37890625, "calib/ece": 0.16934426229508195, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.00819672131147541, "calib/gap": -0.005542917696927052, "calib/mean_conf": 0.5259836065573771, "calib/mu_c": 0.5238255033557047, "calib/mu_w": 0.5293684210526317, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.042336065573770496, "calib/std_conf": 0.1003817290871886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39544033350703495, "calib/step_q_c_n": 1919.0, "calib/step_q_gap": -0.028509723246654017, "calib/step_q_w": 0.42395005675368896, "calib/step_q_w_n": 1762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 833.15234375, "completions/mean_terminated_length": 863.5101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.09173333333333333, "grad_norm": 1.0680092573165894, "kl": 0.151611328125, "learning_rate": 3.1944444444444443e-06, "loss": -0.1967, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019768303260207176, "mask/share_reasoning": 0.7644141912460327, "mask/share_step_conf": 0.18066129088401794, "num_tokens": 27273751.0, "reward": 0.9416360855102539, "reward_std": 0.10730893909931183, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8686760663986206, "rewards/final_brier_reward_step": 0.7075648307800293, "rewards/format_reward_step": 0.953125, "step": 86 }, { "adv/mean_abs_final_conf": 0.760604739189148, "adv/mean_abs_reasoning": 0.29362332820892334, "adv/mean_abs_step_conf": 0.7504727840423584, "adv/ratio_final_to_reasoning": 2.5904097737355216, "adv/ratio_step_to_reasoning": 2.555903131471797, "adv/std_final_conf": 0.9347039461135864, "adv/std_reasoning": 0.5960657000541687, "adv/std_step_conf": 0.9290173053741455, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.91796875, "calib/ece": 0.2595983935742972, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.01606425702811245, "calib/gap": -0.04816696375519913, "calib/mean_conf": 0.5505220883534135, "calib/mu_c": 0.5406565656565656, "calib/mu_w": 0.5888235294117647, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007469879518072288, "calib/std_conf": 0.10218166609328025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41037534571315687, "calib/step_q_c_n": 2531.0, "calib/step_q_gap": -0.042957987620176497, "calib/step_q_w": 0.45333333333333337, "calib/step_q_w_n": 1032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 730.1640625, "completions/mean_terminated_length": 750.6907348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.0928, "grad_norm": 0.9558160901069641, "kl": 0.191650390625, "learning_rate": 3.1666666666666667e-06, "loss": -0.181, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021466806530952454, "mask/share_reasoning": 0.760173499584198, "mask/share_step_conf": 0.19101595878601074, "num_tokens": 27566169.0, "reward": 0.9829153418540955, "reward_std": 0.10887178778648376, "rewards/accuracy_reward_step": 0.7734375, "rewards/asymmetric_l2_reward": 0.8860079646110535, "rewards/final_brier_reward_step": 0.7306039333343506, "rewards/format_reward_step": 0.97265625, "step": 87 }, { "adv/mean_abs_final_conf": 0.7627379298210144, "adv/mean_abs_reasoning": 0.35643988847732544, "adv/mean_abs_step_conf": 0.7553523778915405, "adv/ratio_final_to_reasoning": 2.1398781519076118, "adv/ratio_step_to_reasoning": 2.1191578224264638, "adv/std_final_conf": 0.9345097541809082, "adv/std_reasoning": 0.6613301038742065, "adv/std_step_conf": 0.9294958114624023, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.30078125, "calib/ece": 0.16023904382470122, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.022451117318435854, "calib/mean_conf": 0.5797609561752988, "calib/mu_c": 0.5862011173184358, "calib/mu_w": 0.56375, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013426294820717126, "calib/std_conf": 0.09673114787700834, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4212097812097812, "calib/step_q_c_n": 2331.0, "calib/step_q_gap": -0.005941056779045606, "calib/step_q_w": 0.4271508379888268, "calib/step_q_w_n": 1074.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 801.75, "completions/mean_terminated_length": 814.4762573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.09386666666666667, "grad_norm": 1.090047001838684, "kl": 0.185943603515625, "learning_rate": 3.138888888888889e-06, "loss": -0.0473, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01939891092479229, "mask/share_reasoning": 0.7788498997688293, "mask/share_step_conf": 0.1861262321472168, "num_tokens": 27881265.0, "reward": 0.9857221841812134, "reward_std": 0.10476869344711304, "rewards/accuracy_reward_step": 0.703125, "rewards/asymmetric_l2_reward": 0.875947892665863, "rewards/final_brier_reward_step": 0.7595589756965637, "rewards/format_reward_step": 0.9765625, "step": 88 }, { "adv/mean_abs_final_conf": 0.7650504112243652, "adv/mean_abs_reasoning": 0.34696489572525024, "adv/mean_abs_step_conf": 0.7899448275566101, "adv/ratio_final_to_reasoning": 2.2049792951681852, "adv/ratio_step_to_reasoning": 2.2767283874796966, "adv/std_final_conf": 0.9346397519111633, "adv/std_reasoning": 0.6186051368713379, "adv/std_step_conf": 0.929905891418457, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.71484375, "calib/ece": 0.06285714285714293, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": 0.016666666666666607, "calib/mean_conf": 0.611984126984127, "calib/mu_c": 0.6183333333333333, "calib/mu_w": 0.6016666666666667, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02789682539682542, "calib/std_conf": 0.10291019232590765, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44071326676176886, "calib/step_q_c_n": 2103.0, "calib/step_q_gap": 0.004527187216314232, "calib/step_q_w": 0.43618607954545463, "calib/step_q_w_n": 1408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 812.90625, "completions/mean_terminated_length": 829.099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.09493333333333333, "grad_norm": 1.02834153175354, "kl": 0.177886962890625, "learning_rate": 3.1111111111111116e-06, "loss": -0.054, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01996425911784172, "mask/share_reasoning": 0.7786233425140381, "mask/share_step_conf": 0.18188118934631348, "num_tokens": 28198257.0, "reward": 0.9733993411064148, "reward_std": 0.09265284240245819, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.8785526156425476, "rewards/final_brier_reward_step": 0.7494961023330688, "rewards/format_reward_step": 0.984375, "step": 89 }, { "adv/mean_abs_final_conf": 0.7778650522232056, "adv/mean_abs_reasoning": 0.36196985840797424, "adv/mean_abs_step_conf": 0.7522958517074585, "adv/ratio_final_to_reasoning": 2.148977419402911, "adv/ratio_step_to_reasoning": 2.0783383871138517, "adv/std_final_conf": 0.9350411891937256, "adv/std_reasoning": 0.6403955221176147, "adv/std_step_conf": 0.9289729595184326, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.234375, "calib/ece": 0.14472, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.00227550287356304, "calib/mean_conf": 0.6380800000000001, "calib/mu_c": 0.6375520833333334, "calib/mu_w": 0.6398275862068964, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007400000000000004, "calib/std_conf": 0.10838594742862193, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45985131528783835, "calib/step_q_c_n": 2623.0, "calib/step_q_gap": -0.022949860030477, "calib/step_q_w": 0.48280117531831535, "calib/step_q_w_n": 1021.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 792.09765625, "completions/mean_terminated_length": 807.8765258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.096, "grad_norm": 0.9693269729614258, "kl": 0.194427490234375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0982, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02038637176156044, "mask/share_reasoning": 0.7679857015609741, "mask/share_step_conf": 0.19209665060043335, "num_tokens": 28504354.0, "reward": 1.005354642868042, "reward_std": 0.1205814927816391, "rewards/accuracy_reward_step": 0.75, "rewards/asymmetric_l2_reward": 0.8915820121765137, "rewards/final_brier_reward_step": 0.773814857006073, "rewards/format_reward_step": 0.9765625, "step": 90 }, { "adv/mean_abs_final_conf": 0.7417153120040894, "adv/mean_abs_reasoning": 0.379161536693573, "adv/mean_abs_step_conf": 0.7472665309906006, "adv/ratio_final_to_reasoning": 1.9561987180242955, "adv/ratio_step_to_reasoning": 1.970839493654967, "adv/std_final_conf": 0.9348874092102051, "adv/std_reasoning": 0.6612809300422668, "adv/std_step_conf": 0.9288040995597839, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.890625, "calib/ece": 0.15106299212598429, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.003046391752577371, "calib/mean_conf": 0.6491732283464566, "calib/mu_c": 0.6484536082474226, "calib/mu_w": 0.6515, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018228346456692916, "calib/std_conf": 0.10754465064197179, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4396638987742191, "calib/step_q_c_n": 2529.0, "calib/step_q_gap": -0.016197834429286295, "calib/step_q_w": 0.45586173320350537, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 806.68359375, "completions/mean_terminated_length": 813.035400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 410.0, "epoch": 0.09706666666666666, "grad_norm": 2.234196186065674, "kl": 0.1956787109375, "learning_rate": 3.055555555555556e-06, "loss": -0.0424, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01936754770576954, "mask/share_reasoning": 0.7878203988075256, "mask/share_step_conf": 0.1849995106458664, "num_tokens": 28818577.0, "reward": 1.0171236991882324, "reward_std": 0.1014469712972641, "rewards/accuracy_reward_step": 0.7578125, "rewards/asymmetric_l2_reward": 0.8966686725616455, "rewards/final_brier_reward_step": 0.7875785231590271, "rewards/format_reward_step": 0.9921875, "step": 91 }, { "adv/mean_abs_final_conf": 0.77048259973526, "adv/mean_abs_reasoning": 0.29210782051086426, "adv/mean_abs_step_conf": 0.7310396432876587, "adv/ratio_final_to_reasoning": 2.637665086774367, "adv/ratio_step_to_reasoning": 2.502636327946137, "adv/std_final_conf": 0.9352065920829773, "adv/std_reasoning": 0.5960862636566162, "adv/std_step_conf": 0.9303618669509888, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.93359375, "calib/ece": 0.14920634920634918, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06349206349206349, "calib/gap": -0.006017388249758504, "calib/mean_conf": 0.7204761904761905, "calib/mu_c": 0.7190673575129534, "calib/mu_w": 0.7250847457627119, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05190476190476191, "calib/std_conf": 0.1505748621637963, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4541677044416771, "calib/step_q_c_n": 2409.0, "calib/step_q_gap": -0.020045155868744224, "calib/step_q_w": 0.4742128603104213, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 734.9375, "completions/mean_terminated_length": 743.6522216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.09813333333333334, "grad_norm": 2.943143367767334, "kl": 0.217803955078125, "learning_rate": 3.0277777777777776e-06, "loss": -0.0436, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022311579436063766, "mask/share_reasoning": 0.7753206491470337, "mask/share_step_conf": 0.19064897298812866, "num_tokens": 29113441.0, "reward": 1.0131025314331055, "reward_std": 0.1232241839170456, "rewards/accuracy_reward_step": 0.75390625, "rewards/asymmetric_l2_reward": 0.8971550464630127, "rewards/final_brier_reward_step": 0.7813937664031982, "rewards/format_reward_step": 0.984375, "step": 92 }, { "adv/mean_abs_final_conf": 0.7749847173690796, "adv/mean_abs_reasoning": 0.35182300209999084, "adv/mean_abs_step_conf": 0.757448136806488, "adv/ratio_final_to_reasoning": 2.202768757992756, "adv/ratio_step_to_reasoning": 2.1529238630941343, "adv/std_final_conf": 0.9323045015335083, "adv/std_reasoning": 0.6187039613723755, "adv/std_step_conf": 0.9306651949882507, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.6015625, "calib/ece": 0.18919400000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.428, "calib/gap": 0.01721665748682033, "calib/mean_conf": 0.8688060000000001, "calib/mu_c": 0.873695530726257, "calib/mu_w": 0.8564788732394367, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17100000000000004, "calib/std_conf": 0.1471171756254177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47560898876404495, "calib/step_q_c_n": 2225.0, "calib/step_q_gap": -0.02103985769578004, "calib/step_q_w": 0.496648846459825, "calib/step_q_w_n": 1257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 761.16015625, "completions/mean_terminated_length": 779.4280395507812, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.0992, "grad_norm": 2.2209722995758057, "kl": 0.22332763671875, "learning_rate": 3e-06, "loss": -0.0692, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02104385383427143, "mask/share_reasoning": 0.7661647796630859, "mask/share_step_conf": 0.18935386836528778, "num_tokens": 29414074.0, "reward": 0.9805176258087158, "reward_std": 0.1517704725265503, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8849955797195435, "rewards/final_brier_reward_step": 0.740883469581604, "rewards/format_reward_step": 0.9765625, "step": 93 }, { "adv/mean_abs_final_conf": 0.7081332206726074, "adv/mean_abs_reasoning": 0.24093008041381836, "adv/mean_abs_step_conf": 0.7455571889877319, "adv/ratio_final_to_reasoning": 2.939164837600714, "adv/ratio_step_to_reasoning": 3.0944960783110713, "adv/std_final_conf": 0.9138640761375427, "adv/std_reasoning": 0.5483078956604004, "adv/std_step_conf": 0.9311778545379639, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.578125, "calib/ece": 0.29919960000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.824, "calib/gap": 0.009134295900178335, "calib/mean_conf": 0.9134404, "calib/mu_c": 0.9165460606060606, "calib/mu_w": 0.9074117647058823, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27632000000000007, "calib/std_conf": 0.1760431499600027, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5296780684104627, "calib/step_q_c_n": 1988.0, "calib/step_q_gap": -0.01844021115942973, "calib/step_q_w": 0.5481182795698925, "calib/step_q_w_n": 1488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 723.28515625, "completions/mean_terminated_length": 737.6932373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.10026666666666667, "grad_norm": 2.1355016231536865, "kl": 0.225067138671875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0724, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.021978553384542465, "mask/share_reasoning": 0.7751826047897339, "mask/share_step_conf": 0.18330764770507812, "num_tokens": 29707915.0, "reward": 0.9226744174957275, "reward_std": 0.1337907314300537, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8526961803436279, "rewards/final_brier_reward_step": 0.6684337854385376, "rewards/format_reward_step": 0.9765625, "step": 94 }, { "adv/mean_abs_final_conf": 0.7303610444068909, "adv/mean_abs_reasoning": 0.4113221764564514, "adv/mean_abs_step_conf": 0.7311738729476929, "adv/ratio_final_to_reasoning": 1.7756422731664154, "adv/ratio_step_to_reasoning": 1.7776184091185407, "adv/std_final_conf": 0.9019168019294739, "adv/std_reasoning": 0.6817259788513184, "adv/std_step_conf": 0.9332419037818909, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.39453125, "calib/ece": 0.2656541832669323, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9123505976095617, "calib/gap": -0.004021979375095852, "calib/mean_conf": 0.9445450199203188, "calib/mu_c": 0.9433752808988767, "calib/mu_w": 0.9473972602739725, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2505179282868526, "calib/std_conf": 0.1319623223009936, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5370347284060553, "calib/step_q_c_n": 2246.0, "calib/step_q_gap": 0.00967209104341793, "calib/step_q_w": 0.5273626373626373, "calib/step_q_w_n": 1183.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 829.8984375, "completions/mean_terminated_length": 843.0714721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.10133333333333333, "grad_norm": 1.4856945276260376, "kl": 0.2117919921875, "learning_rate": 2.944444444444445e-06, "loss": -0.0383, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019828487187623978, "mask/share_reasoning": 0.7749653458595276, "mask/share_step_conf": 0.18958114087581635, "num_tokens": 30026497.0, "reward": 0.9484317898750305, "reward_std": 0.1924382746219635, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8564839363098145, "rewards/final_brier_reward_step": 0.7052233815193176, "rewards/format_reward_step": 0.98046875, "step": 95 }, { "adv/mean_abs_final_conf": 0.6234108209609985, "adv/mean_abs_reasoning": 0.385886013507843, "adv/mean_abs_step_conf": 0.7564665079116821, "adv/ratio_final_to_reasoning": 1.6155310095174722, "adv/ratio_step_to_reasoning": 1.9603366834551188, "adv/std_final_conf": 0.8285508155822754, "adv/std_reasoning": 0.6815575957298279, "adv/std_step_conf": 0.9312968254089355, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.27734375, "calib/ece": 0.20303149606299234, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9606299212598425, "calib/gap": 0.0030050505050505105, "calib/mean_conf": 0.9648425196850395, "calib/mu_c": 0.9655050505050505, "calib/mu_w": 0.9625, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1941732283464569, "calib/std_conf": 0.08685547097187186, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5228502415458938, "calib/step_q_c_n": 2484.0, "calib/step_q_gap": 0.0038447770650195112, "calib/step_q_w": 0.5190054644808743, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 744.59765625, "completions/mean_terminated_length": 750.4606323242188, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.1024, "grad_norm": 2.950009822845459, "kl": 0.220428466796875, "learning_rate": 2.916666666666667e-06, "loss": 0.0198, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020661910995841026, "mask/share_reasoning": 0.7765408754348755, "mask/share_step_conf": 0.19498467445373535, "num_tokens": 30322930.0, "reward": 1.011261224746704, "reward_std": 0.15859991312026978, "rewards/accuracy_reward_step": 0.7734375, "rewards/asymmetric_l2_reward": 0.8882651329040527, "rewards/final_brier_reward_step": 0.7811324000358582, "rewards/format_reward_step": 0.9921875, "step": 96 }, { "adv/mean_abs_final_conf": 0.5439713597297668, "adv/mean_abs_reasoning": 0.33722078800201416, "adv/mean_abs_step_conf": 0.7473347187042236, "adv/ratio_final_to_reasoning": 1.61310150229089, "adv/ratio_step_to_reasoning": 2.216158508886943, "adv/std_final_conf": 0.7314662337303162, "adv/std_reasoning": 0.6402046084403992, "adv/std_step_conf": 0.9306349158287048, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.515625, "calib/ece": 0.2613779527559056, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0022701863354039897, "calib/mean_conf": 0.9857874015748032, "calib/mu_c": 0.986413043478261, "calib/mu_w": 0.984142857142857, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2613779527559056, "calib/std_conf": 0.011362412077055493, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4757070707070708, "calib/step_q_c_n": 2376.0, "calib/step_q_gap": -0.014016176525401547, "calib/step_q_w": 0.48972324723247235, "calib/step_q_w_n": 1084.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 766.25, "completions/mean_terminated_length": 772.283447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 293.0, "epoch": 0.10346666666666667, "grad_norm": 1.606526494026184, "kl": 0.219146728515625, "learning_rate": 2.888888888888889e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02031584270298481, "mask/share_reasoning": 0.7755470275878906, "mask/share_step_conf": 0.19632458686828613, "num_tokens": 30624162.0, "reward": 0.9770911931991577, "reward_std": 0.13751503825187683, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.8849016427993774, "rewards/final_brier_reward_step": 0.7270933389663696, "rewards/format_reward_step": 0.9921875, "step": 97 }, { "adv/mean_abs_final_conf": 0.47782450914382935, "adv/mean_abs_reasoning": 0.40562379360198975, "adv/mean_abs_step_conf": 0.7651926279067993, "adv/ratio_final_to_reasoning": 1.177999211783629, "adv/ratio_step_to_reasoning": 1.8864589306061008, "adv/std_final_conf": 0.6697249412536621, "adv/std_reasoning": 0.6404144167900085, "adv/std_step_conf": 0.9311217069625854, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.1171875, "calib/ece": 0.31866274509803916, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0004068504594818556, "calib/mean_conf": 0.9892509803921568, "calib/mu_c": 0.9891169590643274, "calib/mu_w": 0.9895238095238093, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31866274509803916, "calib/std_conf": 0.0028955254559893296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46385658067337127, "calib/step_q_c_n": 2287.0, "calib/step_q_gap": -0.010808076447955006, "calib/step_q_w": 0.47466465712132627, "calib/step_q_w_n": 1327.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 814.734375, "completions/mean_terminated_length": 821.1495971679688, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.10453333333333334, "grad_norm": 1.3665584325790405, "kl": 0.205047607421875, "learning_rate": 2.861111111111111e-06, "loss": -0.0026, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01965896040201187, "mask/share_reasoning": 0.7770512104034424, "mask/share_step_conf": 0.19547730684280396, "num_tokens": 30938918.0, "reward": 0.949679970741272, "reward_std": 0.16148099303245544, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8918271064758301, "rewards/final_brier_reward_step": 0.674720287322998, "rewards/format_reward_step": 0.99609375, "step": 98 }, { "adv/mean_abs_final_conf": 0.46672576665878296, "adv/mean_abs_reasoning": 0.45751768350601196, "adv/mean_abs_step_conf": 0.7545075416564941, "adv/ratio_final_to_reasoning": 1.020126179784371, "adv/ratio_step_to_reasoning": 1.649133069293877, "adv/std_final_conf": 0.7218591570854187, "adv/std_reasoning": 0.7206075191497803, "adv/std_step_conf": 0.9330312013626099, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.2890625, "calib/ece": 0.4326482213438736, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.928571428545862e-05, "calib/mean_conf": 0.9899604743083005, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899107142857143, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4326482213438736, "calib/std_conf": 0.0006274509038097849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46883780991735535, "calib/step_q_c_n": 1936.0, "calib/step_q_gap": -0.0033967704668710907, "calib/step_q_w": 0.47223458038422644, "calib/step_q_w_n": 1978.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 909.82421875, "completions/mean_terminated_length": 920.6126708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 227.0, "epoch": 0.1056, "grad_norm": 0.6389037370681763, "kl": 0.1871337890625, "learning_rate": 2.8333333333333335e-06, "loss": -0.0461, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018583280965685844, "mask/share_reasoning": 0.7775055170059204, "mask/share_step_conf": 0.1921924650669098, "num_tokens": 31277633.0, "reward": 0.8673032522201538, "reward_std": 0.18501123785972595, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.8672846555709839, "rewards/final_brier_reward_step": 0.5595093965530396, "rewards/format_reward_step": 0.98828125, "step": 99 }, { "adv/mean_abs_final_conf": 0.30531004071235657, "adv/mean_abs_reasoning": 0.30467742681503296, "adv/mean_abs_step_conf": 0.7334623336791992, "adv/ratio_final_to_reasoning": 1.0020763398980248, "adv/ratio_step_to_reasoning": 2.4073405809760824, "adv/std_final_conf": 0.593397855758667, "adv/std_reasoning": 0.5960379838943481, "adv/std_step_conf": 0.9316920042037964, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.4296875, "calib/ece": 0.3514457831325303, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3514457831325303, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44358939802336034, "calib/step_q_c_n": 2226.0, "calib/step_q_gap": -0.001683223786384469, "calib/step_q_w": 0.4452726218097448, "calib/step_q_w_n": 1724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 913.078125, "completions/mean_terminated_length": 938.7469482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.10666666666666667, "grad_norm": 0.6614947319030762, "kl": 0.202117919921875, "learning_rate": 2.805555555555556e-06, "loss": -0.1633, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01702977530658245, "mask/share_reasoning": 0.7675555944442749, "mask/share_step_conf": 0.1880708634853363, "num_tokens": 31618789.0, "reward": 0.9165194630622864, "reward_std": 0.13026177883148193, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.8862611055374146, "rewards/final_brier_reward_step": 0.6280276775360107, "rewards/format_reward_step": 0.97265625, "step": 100 }, { "adv/mean_abs_final_conf": 0.45379549264907837, "adv/mean_abs_reasoning": 0.44616055488586426, "adv/mean_abs_step_conf": 0.7388205528259277, "adv/ratio_final_to_reasoning": 1.017112534220259, "adv/ratio_step_to_reasoning": 1.6559522009177414, "adv/std_final_conf": 0.7018131613731384, "adv/std_reasoning": 0.7014263272285461, "adv/std_step_conf": 0.9303562045097351, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 16.34765625, "calib/ece": 0.374920634920635, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.374920634920635, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4024644453474198, "calib/step_q_c_n": 2461.0, "calib/step_q_gap": -0.023365021009888853, "calib/step_q_w": 0.4258294663573087, "calib/step_q_w_n": 1724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1000.890625, "completions/mean_terminated_length": 1016.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.10773333333333333, "grad_norm": 0.7661640644073486, "kl": 0.17724609375, "learning_rate": 2.7777777777777783e-06, "loss": -0.081, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.015615017153322697, "mask/share_reasoning": 0.7730352878570557, "mask/share_step_conf": 0.1957247257232666, "num_tokens": 31982009.0, "reward": 0.9121058583259583, "reward_std": 0.16850075125694275, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8932945132255554, "rewards/final_brier_reward_step": 0.6129484176635742, "rewards/format_reward_step": 0.984375, "step": 101 }, { "adv/mean_abs_final_conf": 0.3123232126235962, "adv/mean_abs_reasoning": 0.29868799448013306, "adv/mean_abs_step_conf": 0.7256982922554016, "adv/ratio_final_to_reasoning": 1.0456503722795931, "adv/ratio_step_to_reasoning": 2.429619889873648, "adv/std_final_conf": 0.596247136592865, "adv/std_reasoning": 0.5963002443313599, "adv/std_step_conf": 0.9300379753112793, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 16.14453125, "calib/ece": 0.28508196721311474, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28508196721311474, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.36167436005035664, "calib/step_q_c_n": 2383.0, "calib/step_q_gap": -0.06760563994964336, "calib/step_q_w": 0.42928, "calib/step_q_w_n": 1750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 788.51171875, "completions/mean_terminated_length": 827.2909545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.1088, "grad_norm": 1.026517629623413, "kl": 0.18798828125, "learning_rate": 2.7500000000000004e-06, "loss": -0.2047, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018615853041410446, "mask/share_reasoning": 0.7346301078796387, "mask/share_step_conf": 0.19987910985946655, "num_tokens": 32290564.0, "reward": 0.9251087307929993, "reward_std": 0.14792901277542114, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8486717343330383, "rewards/final_brier_reward_step": 0.6773269176483154, "rewards/format_reward_step": 0.94921875, "step": 102 }, { "adv/mean_abs_final_conf": 0.4182831645011902, "adv/mean_abs_reasoning": 0.3931826055049896, "adv/mean_abs_step_conf": 0.7291675806045532, "adv/ratio_final_to_reasoning": 1.0638394441787737, "adv/ratio_step_to_reasoning": 1.8545265492303165, "adv/std_final_conf": 0.6994045972824097, "adv/std_reasoning": 0.7015752196311951, "adv/std_step_conf": 0.9330693483352661, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 16.8828125, "calib/ece": 0.30227426160337556, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.521472392644444e-05, "calib/mean_conf": 0.9900379746835444, "calib/mu_c": 0.9900552147239263, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30227426160337556, "calib/std_conf": 0.000583378158065971, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.36750546089995634, "calib/step_q_c_n": 2289.0, "calib/step_q_gap": -0.03341928086098811, "calib/step_q_w": 0.40092474176094445, "calib/step_q_w_n": 2033.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 985.3203125, "completions/mean_terminated_length": 1064.3121337890625, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 0.10986666666666667, "grad_norm": 0.6708924770355225, "kl": 0.1518402099609375, "learning_rate": 2.7222222222222224e-06, "loss": -0.4667, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.014828003942966461, "mask/share_reasoning": 0.731698751449585, "mask/share_step_conf": 0.17925453186035156, "num_tokens": 32647358.0, "reward": 0.8872878551483154, "reward_std": 0.2002764344215393, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.8252140283584595, "rewards/final_brier_reward_step": 0.6384242177009583, "rewards/format_reward_step": 0.91796875, "step": 103 }, { "adv/mean_abs_final_conf": 0.49375733733177185, "adv/mean_abs_reasoning": 0.47710275650024414, "adv/mean_abs_step_conf": 0.7557744979858398, "adv/ratio_final_to_reasoning": 1.0349077438866552, "adv/ratio_step_to_reasoning": 1.5840916609448537, "adv/std_final_conf": 0.7372961640357971, "adv/std_reasoning": 0.7395596504211426, "adv/std_step_conf": 0.9327031373977661, "calib/answer_extract_rate": 0.90625, "calib/avg_num_step_conf": 19.78515625, "calib/ece": 0.41868831168831167, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002727272727270913, "calib/mean_conf": 0.9901168831168831, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9902727272727272, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41868831168831167, "calib/std_conf": 0.0010189633893991196, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3568018648018648, "calib/step_q_c_n": 2145.0, "calib/step_q_gap": -0.07805498451320375, "calib/step_q_w": 0.43485684931506857, "calib/step_q_w_n": 2920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 886.83203125, "completions/mean_terminated_length": 978.5732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.11093333333333333, "grad_norm": 0.7755709290504456, "kl": 0.155731201171875, "learning_rate": 2.6944444444444444e-06, "loss": -0.3994, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.014660969376564026, "mask/share_reasoning": 0.6996749639511108, "mask/share_step_conf": 0.19191399216651917, "num_tokens": 32981067.0, "reward": 0.813389241695404, "reward_std": 0.22139553725719452, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.8201252222061157, "rewards/final_brier_reward_step": 0.5230593681335449, "rewards/format_reward_step": 0.90234375, "step": 104 }, { "adv/mean_abs_final_conf": 0.5229056477546692, "adv/mean_abs_reasoning": 0.5119879245758057, "adv/mean_abs_step_conf": 0.7275201678276062, "adv/ratio_final_to_reasoning": 1.0213241810105367, "adv/ratio_step_to_reasoning": 1.4209713411314813, "adv/std_final_conf": 0.7754999399185181, "adv/std_reasoning": 0.7758030295372009, "adv/std_step_conf": 0.9329676628112793, "calib/answer_extract_rate": 0.90234375, "calib/avg_num_step_conf": 17.55859375, "calib/ece": 0.22039130434782606, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001886792452828745, "calib/mean_conf": 0.9899565217391304, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9898113207547172, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22039130434782606, "calib/std_conf": 0.0006579454761052857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38908143573909687, "calib/step_q_c_n": 2591.0, "calib/step_q_gap": -0.04803857476510476, "calib/step_q_w": 0.43712001050420163, "calib/step_q_w_n": 1904.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 878.91796875, "completions/mean_terminated_length": 969.8405151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.112, "grad_norm": 0.7921664118766785, "kl": 0.1516265869140625, "learning_rate": 2.666666666666667e-06, "loss": -0.5074, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.015202083624899387, "mask/share_reasoning": 0.7083303928375244, "mask/share_step_conf": 0.18271753191947937, "num_tokens": 33311830.0, "reward": 0.9064150452613831, "reward_std": 0.26135513186454773, "rewards/accuracy_reward_step": 0.69140625, "rewards/asymmetric_l2_reward": 0.7993272542953491, "rewards/final_brier_reward_step": 0.6955339908599854, "rewards/format_reward_step": 0.8984375, "step": 105 }, { "adv/mean_abs_final_conf": 0.44278451800346375, "adv/mean_abs_reasoning": 0.44178658723831177, "adv/mean_abs_step_conf": 0.7569739818572998, "adv/ratio_final_to_reasoning": 1.0022588525636105, "adv/ratio_step_to_reasoning": 1.7134381253837554, "adv/std_final_conf": 0.7180078029632568, "adv/std_reasoning": 0.7207124829292297, "adv/std_step_conf": 0.9320871233940125, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 16.7578125, "calib/ece": 0.34983263598326364, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34983263598326364, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41046433770014557, "calib/step_q_c_n": 2061.0, "calib/step_q_gap": -0.028772988455081028, "calib/step_q_w": 0.4392373261552266, "calib/step_q_w_n": 2229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 847.265625, "completions/mean_terminated_length": 903.7500610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.11306666666666666, "grad_norm": 0.9915971755981445, "kl": 0.1571197509765625, "learning_rate": 2.6388888888888893e-06, "loss": -0.3216, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.016094494611024857, "mask/share_reasoning": 0.730865478515625, "mask/share_step_conf": 0.19054001569747925, "num_tokens": 33633314.0, "reward": 0.8769814968109131, "reward_std": 0.20083826780319214, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.8481184244155884, "rewards/final_brier_reward_step": 0.600375771522522, "rewards/format_reward_step": 0.9296875, "step": 106 }, { "adv/mean_abs_final_conf": 0.4825403690338135, "adv/mean_abs_reasoning": 0.4727458655834198, "adv/mean_abs_step_conf": 0.7345182299613953, "adv/ratio_final_to_reasoning": 1.0207183270409064, "adv/ratio_step_to_reasoning": 1.5537274536603718, "adv/std_final_conf": 0.7396191954612732, "adv/std_reasoning": 0.7394405007362366, "adv/std_step_conf": 0.932647168636322, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.2265625, "calib/ece": 0.2947967479674798, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.485380116956097e-05, "calib/mean_conf": 0.9899186991869919, "calib/mu_c": 0.9899415204678361, "calib/mu_w": 0.9898666666666666, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2947967479674798, "calib/std_conf": 0.0008979968306656315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4337848791893999, "calib/step_q_c_n": 2566.0, "calib/step_q_gap": -0.0463566373271167, "calib/step_q_w": 0.4801415165165166, "calib/step_q_w_n": 1332.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 876.30078125, "completions/mean_terminated_length": 908.2307739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.11413333333333334, "grad_norm": 0.40814369916915894, "kl": 0.1380615234375, "learning_rate": 2.6111111111111113e-06, "loss": -0.129, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01719970628619194, "mask/share_reasoning": 0.7444440722465515, "mask/share_step_conf": 0.20319995284080505, "num_tokens": 33962263.0, "reward": 0.9329098463058472, "reward_std": 0.21053895354270935, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8662306070327759, "rewards/final_brier_reward_step": 0.6738077998161316, "rewards/format_reward_step": 0.9609375, "step": 107 }, { "adv/mean_abs_final_conf": 0.3959675133228302, "adv/mean_abs_reasoning": 0.3670308589935303, "adv/mean_abs_step_conf": 0.7363247871398926, "adv/ratio_final_to_reasoning": 1.0788398403574289, "adv/ratio_step_to_reasoning": 2.0061658825065494, "adv/std_final_conf": 0.6727659106254578, "adv/std_reasoning": 0.6612817645072937, "adv/std_step_conf": 0.9305558800697327, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.9140625, "calib/ece": 0.22109756097560973, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0007936507936510129, "calib/mean_conf": 0.989390243902439, "calib/mu_c": 0.9892063492063491, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22109756097560973, "calib/std_conf": 0.004233306087246293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4414230062477031, "calib/step_q_c_n": 2721.0, "calib/step_q_gap": -0.044487562857987994, "calib/step_q_w": 0.4859105691056911, "calib/step_q_w_n": 1353.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 935.88671875, "completions/mean_terminated_length": 969.9878540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.1152, "grad_norm": 1.3296533823013306, "kl": 0.1182861328125, "learning_rate": 2.5833333333333337e-06, "loss": -0.1934, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016218841075897217, "mask/share_reasoning": 0.7538913488388062, "mask/share_step_conf": 0.19473358988761902, "num_tokens": 34305082.0, "reward": 0.9770240783691406, "reward_std": 0.15435056388378143, "rewards/accuracy_reward_step": 0.73828125, "rewards/asymmetric_l2_reward": 0.8715953826904297, "rewards/final_brier_reward_step": 0.7426090240478516, "rewards/format_reward_step": 0.9609375, "step": 108 }, { "adv/mean_abs_final_conf": 0.5219681859016418, "adv/mean_abs_reasoning": 0.4611818194389343, "adv/mean_abs_step_conf": 0.7467175722122192, "adv/ratio_final_to_reasoning": 1.1318056434589272, "adv/ratio_step_to_reasoning": 1.6191392217513316, "adv/std_final_conf": 0.7728863954544067, "adv/std_reasoning": 0.7393205165863037, "adv/std_step_conf": 0.9320031404495239, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 16.83984375, "calib/ece": 0.4016115702479338, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010028169014082433, "calib/mean_conf": 0.9883884297520661, "calib/mu_c": 0.9888028169014083, "calib/mu_w": 0.9878, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4016115702479338, "calib/std_conf": 0.006702499719731377, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43289906832298136, "calib/step_q_c_n": 1932.0, "calib/step_q_gap": -0.0026070266749169124, "calib/step_q_w": 0.4355060949978983, "calib/step_q_w_n": 2379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 908.0703125, "completions/mean_terminated_length": 956.650146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.11626666666666667, "grad_norm": 1.7965545654296875, "kl": 0.132904052734375, "learning_rate": 2.5555555555555557e-06, "loss": -0.1666, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01624187082052231, "mask/share_reasoning": 0.7423617243766785, "mask/share_step_conf": 0.19061516225337982, "num_tokens": 34642148.0, "reward": 0.8518782258033752, "reward_std": 0.19292962551116943, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8452537059783936, "rewards/final_brier_reward_step": 0.5600652098655701, "rewards/format_reward_step": 0.9375, "step": 109 }, { "adv/mean_abs_final_conf": 0.5707782506942749, "adv/mean_abs_reasoning": 0.44305166602134705, "adv/mean_abs_step_conf": 0.7511885762214661, "adv/ratio_final_to_reasoning": 1.2882882392022734, "adv/ratio_step_to_reasoning": 1.6954875330166852, "adv/std_final_conf": 0.777371883392334, "adv/std_reasoning": 0.7013576626777649, "adv/std_step_conf": 0.9295288324356079, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.125, "calib/ece": 0.31266666666666654, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006367329784251474, "calib/mean_conf": 0.9871764705882352, "calib/mu_c": 0.9873837209302324, "calib/mu_w": 0.9867469879518073, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31266666666666654, "calib/std_conf": 0.008108832952834717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3911467770034843, "calib/step_q_c_n": 2296.0, "calib/step_q_gap": -0.0136145866328794, "calib/step_q_w": 0.4047613636363637, "calib/step_q_w_n": 1320.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 853.45703125, "completions/mean_terminated_length": 860.1771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.11733333333333333, "grad_norm": 1.3909984827041626, "kl": 0.1324920654296875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0326, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019083624705672264, "mask/share_reasoning": 0.7716012597084045, "mask/share_step_conf": 0.20150259137153625, "num_tokens": 34965553.0, "reward": 0.9630765318870544, "reward_std": 0.1603553593158722, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.9123197793960571, "rewards/final_brier_reward_step": 0.6802394390106201, "rewards/format_reward_step": 0.99609375, "step": 110 }, { "adv/mean_abs_final_conf": 0.5458108186721802, "adv/mean_abs_reasoning": 0.4226713478565216, "adv/mean_abs_step_conf": 0.7347046732902527, "adv/ratio_final_to_reasoning": 1.2913362153364107, "adv/ratio_step_to_reasoning": 1.738241016373915, "adv/std_final_conf": 0.746563732624054, "adv/std_reasoning": 0.6817759275436401, "adv/std_step_conf": 0.9296801686286926, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.7265625, "calib/ece": 0.24394308943089446, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.007010645604395571, "calib/mean_conf": 0.9837804878048781, "calib/mu_c": 0.9856043956043956, "calib/mu_w": 0.97859375, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24394308943089446, "calib/std_conf": 0.02963080044010616, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36963867781155013, "calib/step_q_c_n": 2632.0, "calib/step_q_gap": -0.008588366664776914, "calib/step_q_w": 0.37822704447632705, "calib/step_q_w_n": 1394.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 852.84765625, "completions/mean_terminated_length": 883.923095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.1184, "grad_norm": 1.9308661222457886, "kl": 0.12811279296875, "learning_rate": 2.5e-06, "loss": -0.0854, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017727062106132507, "mask/share_reasoning": 0.7479400634765625, "mask/share_step_conf": 0.1991766095161438, "num_tokens": 35291290.0, "reward": 0.9627739191055298, "reward_std": 0.17257159948349, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.8706286549568176, "rewards/final_brier_reward_step": 0.7205440998077393, "rewards/format_reward_step": 0.9609375, "step": 111 }, { "adv/mean_abs_final_conf": 0.3836849331855774, "adv/mean_abs_reasoning": 0.30349308252334595, "adv/mean_abs_step_conf": 0.7266433835029602, "adv/ratio_final_to_reasoning": 1.2642295830781012, "adv/ratio_step_to_reasoning": 2.3942667077002118, "adv/std_final_conf": 0.6613947153091431, "adv/std_reasoning": 0.5960959196090698, "adv/std_step_conf": 0.9305977821350098, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 16.65625, "calib/ece": 0.29832000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": 0.006018187239117512, "calib/mean_conf": 0.9863200000000001, "calib/mu_c": 0.9881976744186045, "calib/mu_w": 0.982179487179487, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29832000000000003, "calib/std_conf": 0.014256843970528683, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3896404326743752, "calib/step_q_c_n": 2681.0, "calib/step_q_gap": 0.0033579310950953567, "calib/step_q_w": 0.38628250157927985, "calib/step_q_w_n": 1583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 954.86328125, "completions/mean_terminated_length": 973.884521484375, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.11946666666666667, "grad_norm": 1.359420895576477, "kl": 0.1134490966796875, "learning_rate": 2.4722222222222226e-06, "loss": -0.0681, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01621391251683235, "mask/share_reasoning": 0.7730250358581543, "mask/share_step_conf": 0.19122979044914246, "num_tokens": 35643655.0, "reward": 0.9456712603569031, "reward_std": 0.12339536845684052, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.879301905632019, "rewards/final_brier_reward_step": 0.682353138923645, "rewards/format_reward_step": 0.9765625, "step": 112 }, { "adv/mean_abs_final_conf": 0.4557688534259796, "adv/mean_abs_reasoning": 0.35456550121307373, "adv/mean_abs_step_conf": 0.7351700663566589, "adv/ratio_final_to_reasoning": 1.2854292136901622, "adv/ratio_step_to_reasoning": 2.073439361250387, "adv/std_final_conf": 0.7352324724197388, "adv/std_reasoning": 0.6613417267799377, "adv/std_step_conf": 0.9299939274787903, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.85546875, "calib/ece": 0.32387755102040816, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003104893012122467, "calib/mean_conf": 0.9891836734693877, "calib/mu_c": 0.9890797546012268, "calib/mu_w": 0.989390243902439, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32387755102040816, "calib/std_conf": 0.004255481312319471, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3820414788097385, "calib/step_q_c_n": 2218.0, "calib/step_q_gap": -0.04620132401481336, "calib/step_q_w": 0.42824280282455185, "calib/step_q_w_n": 1841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 810.05078125, "completions/mean_terminated_length": 842.9796142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.12053333333333334, "grad_norm": 1.6558398008346558, "kl": 0.131866455078125, "learning_rate": 2.4444444444444447e-06, "loss": -0.1583, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01881478540599346, "mask/share_reasoning": 0.7420645952224731, "mask/share_step_conf": 0.20005816221237183, "num_tokens": 35956228.0, "reward": 0.9194172620773315, "reward_std": 0.1609131395816803, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.8766982555389404, "rewards/final_brier_reward_step": 0.6433863043785095, "rewards/format_reward_step": 0.95703125, "step": 113 }, { "adv/mean_abs_final_conf": 0.3602434992790222, "adv/mean_abs_reasoning": 0.34155547618865967, "adv/mean_abs_step_conf": 0.7079946398735046, "adv/ratio_final_to_reasoning": 1.0547144589772004, "adv/ratio_step_to_reasoning": 2.072854014152713, "adv/std_final_conf": 0.6267930865287781, "adv/std_reasoning": 0.6186633706092834, "adv/std_step_conf": 0.9312063455581665, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.2578125, "calib/ece": 0.23170634920634925, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00025491374131014766, "calib/mean_conf": 0.9896428571428572, "calib/mu_c": 0.9895811518324606, "calib/mu_w": 0.9898360655737708, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23170634920634925, "calib/std_conf": 0.0028645736703609465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41471044045676997, "calib/step_q_c_n": 2452.0, "calib/step_q_gap": -0.020268190594983027, "calib/step_q_w": 0.434978631051753, "calib/step_q_w_n": 1198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 824.96875, "completions/mean_terminated_length": 834.7510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 289.0, "epoch": 0.1216, "grad_norm": 0.44312945008277893, "kl": 0.1288299560546875, "learning_rate": 2.4166666666666667e-06, "loss": 0.0298, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01984499767422676, "mask/share_reasoning": 0.7663899064064026, "mask/share_step_conf": 0.20204633474349976, "num_tokens": 36272444.0, "reward": 0.9953356981277466, "reward_std": 0.1457144170999527, "rewards/accuracy_reward_step": 0.74609375, "rewards/asymmetric_l2_reward": 0.8937538862228394, "rewards/final_brier_reward_step": 0.7508238554000854, "rewards/format_reward_step": 0.984375, "step": 114 }, { "adv/mean_abs_final_conf": 0.34107381105422974, "adv/mean_abs_reasoning": 0.3319578766822815, "adv/mean_abs_step_conf": 0.7424596548080444, "adv/ratio_final_to_reasoning": 1.0274611178474102, "adv/ratio_step_to_reasoning": 2.2366080366234424, "adv/std_final_conf": 0.640224039554596, "adv/std_reasoning": 0.6403406262397766, "adv/std_step_conf": 0.9308071136474609, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.375, "calib/ece": 0.3594779116465865, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3594779116465865, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43176690079016683, "calib/step_q_c_n": 2278.0, "calib/step_q_gap": -0.021447815735767983, "calib/step_q_w": 0.4532147165259348, "calib/step_q_w_n": 1658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 789.09765625, "completions/mean_terminated_length": 808.0360107421875, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.12266666666666666, "grad_norm": 0.4511660039424896, "kl": 0.1277008056640625, "learning_rate": 2.388888888888889e-06, "loss": -0.1358, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018690839409828186, "mask/share_reasoning": 0.7570353746414185, "mask/share_step_conf": 0.20083630084991455, "num_tokens": 36579717.0, "reward": 0.90870201587677, "reward_std": 0.14805927872657776, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8798450231552124, "rewards/final_brier_reward_step": 0.6203714609146118, "rewards/format_reward_step": 0.97265625, "step": 115 }, { "adv/mean_abs_final_conf": 0.4521879553794861, "adv/mean_abs_reasoning": 0.42638707160949707, "adv/mean_abs_step_conf": 0.7635422945022583, "adv/ratio_final_to_reasoning": 1.0605104739048432, "adv/ratio_step_to_reasoning": 1.7907257169409254, "adv/std_final_conf": 0.7208244204521179, "adv/std_reasoning": 0.7014361023902893, "adv/std_step_conf": 0.9302881956100464, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.34765625, "calib/ece": 0.2927490039840638, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013157894736848252, "calib/mean_conf": 0.9899601593625499, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9898684210526314, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2927490039840638, "calib/std_conf": 0.0006299357888781638, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4466025286448045, "calib/step_q_c_n": 2531.0, "calib/step_q_gap": -0.01276084760698376, "calib/step_q_w": 0.45936337625178825, "calib/step_q_w_n": 1398.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 931.12109375, "completions/mean_terminated_length": 942.162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.12373333333333333, "grad_norm": 0.48412713408470154, "kl": 0.11883544921875, "learning_rate": 2.361111111111111e-06, "loss": -0.0217, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0181062500923872, "mask/share_reasoning": 0.7784036993980408, "mask/share_step_conf": 0.19177131354808807, "num_tokens": 36922604.0, "reward": 0.9590939283370972, "reward_std": 0.17646659910678864, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.8958651423454285, "rewards/final_brier_reward_step": 0.68951016664505, "rewards/format_reward_step": 0.98046875, "step": 116 }, { "adv/mean_abs_final_conf": 0.4019792675971985, "adv/mean_abs_reasoning": 0.3866897225379944, "adv/mean_abs_step_conf": 0.7260673642158508, "adv/ratio_final_to_reasoning": 1.0395395692413363, "adv/ratio_step_to_reasoning": 1.8776484656752437, "adv/std_final_conf": 0.6823669672012329, "adv/std_reasoning": 0.6817616820335388, "adv/std_step_conf": 0.9302095770835876, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.13671875, "calib/ece": 0.40676113360323896, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00041599244875978325, "calib/mean_conf": 0.989757085020243, "calib/mu_c": 0.9899305555555556, "calib/mu_w": 0.9895145631067959, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40676113360323896, "calib/std_conf": 0.0015395265863831224, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4087016428192899, "calib/step_q_c_n": 1887.0, "calib/step_q_gap": -0.012367270661595431, "calib/step_q_w": 0.4210689134808853, "calib/step_q_w_n": 1988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 789.8984375, "completions/mean_terminated_length": 818.68017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.1248, "grad_norm": 0.5243476033210754, "kl": 0.1358184814453125, "learning_rate": 2.3333333333333336e-06, "loss": -0.1619, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019153062254190445, "mask/share_reasoning": 0.7505680322647095, "mask/share_step_conf": 0.19512265920639038, "num_tokens": 37231418.0, "reward": 0.8781992197036743, "reward_std": 0.172608882188797, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.8800956606864929, "rewards/final_brier_reward_step": 0.5708339810371399, "rewards/format_reward_step": 0.96484375, "step": 117 }, { "adv/mean_abs_final_conf": 0.38207024335861206, "adv/mean_abs_reasoning": 0.3419240117073059, "adv/mean_abs_step_conf": 0.7608519792556763, "adv/ratio_final_to_reasoning": 1.1174127299537893, "adv/ratio_step_to_reasoning": 2.225207804086545, "adv/std_final_conf": 0.681808352470398, "adv/std_reasoning": 0.6402933597564697, "adv/std_step_conf": 0.9277486801147461, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.55859375, "calib/ece": 0.3308032128514057, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009325681492110549, "calib/mean_conf": 0.9894377510040161, "calib/mu_c": 0.9897560975609756, "calib/mu_w": 0.9888235294117645, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3308032128514057, "calib/std_conf": 0.0024717562334825916, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3785341841385597, "calib/step_q_c_n": 2194.0, "calib/step_q_gap": -0.01766480971275386, "calib/step_q_w": 0.3961989938513136, "calib/step_q_w_n": 1789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 882.83203125, "completions/mean_terminated_length": 900.4183349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.12586666666666665, "grad_norm": 1.1578727960586548, "kl": 0.126190185546875, "learning_rate": 2.305555555555556e-06, "loss": -0.0723, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018077511340379715, "mask/share_reasoning": 0.7650994658470154, "mask/share_step_conf": 0.1972917914390564, "num_tokens": 37561431.0, "reward": 0.9241103529930115, "reward_std": 0.14115437865257263, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.879349946975708, "rewards/final_brier_reward_step": 0.6477769017219543, "rewards/format_reward_step": 0.96484375, "step": 118 }, { "adv/mean_abs_final_conf": 0.5348653793334961, "adv/mean_abs_reasoning": 0.30639633536338806, "adv/mean_abs_step_conf": 0.7165675759315491, "adv/ratio_final_to_reasoning": 1.7456650671070926, "adv/ratio_step_to_reasoning": 2.3386949947743183, "adv/std_final_conf": 0.717117965221405, "adv/std_reasoning": 0.5961382389068604, "adv/std_step_conf": 0.9285202622413635, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.9453125, "calib/ece": 0.2746428571428572, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003126195760312145, "calib/mean_conf": 0.9849603174603175, "calib/mu_c": 0.9858659217877095, "calib/mu_w": 0.9827397260273973, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2746428571428572, "calib/std_conf": 0.006516037350749447, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3509469668065624, "calib/step_q_c_n": 2621.0, "calib/step_q_gap": -0.0177840393536019, "calib/step_q_w": 0.3687310061601643, "calib/step_q_w_n": 1461.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 888.5703125, "completions/mean_terminated_length": 902.6746826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.12693333333333334, "grad_norm": 0.9960907697677612, "kl": 0.13580322265625, "learning_rate": 2.277777777777778e-06, "loss": -0.1002, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01829800382256508, "mask/share_reasoning": 0.7680920362472534, "mask/share_step_conf": 0.1979849934577942, "num_tokens": 37893969.0, "reward": 0.9754058718681335, "reward_std": 0.12715084850788116, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.9052948951721191, "rewards/final_brier_reward_step": 0.7087980508804321, "rewards/format_reward_step": 0.984375, "step": 119 }, { "adv/mean_abs_final_conf": 0.5333976745605469, "adv/mean_abs_reasoning": 0.3316050171852112, "adv/mean_abs_step_conf": 0.7758373618125916, "adv/ratio_final_to_reasoning": 1.60853318531857, "adv/ratio_step_to_reasoning": 2.3396430138427715, "adv/std_final_conf": 0.7136245369911194, "adv/std_reasoning": 0.618525505065918, "adv/std_step_conf": 0.9269682765007019, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.12109375, "calib/ece": 0.25180000000000025, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004375662670255243, "calib/mean_conf": 0.9838000000000002, "calib/mu_c": 0.9849726775956285, "calib/mu_w": 0.9805970149253732, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25180000000000025, "calib/std_conf": 0.006600000000000006, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3010981725456864, "calib/step_q_c_n": 2353.0, "calib/step_q_gap": 0.009830343702580235, "calib/step_q_w": 0.29126782884310615, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 804.8125, "completions/mean_terminated_length": 824.1280517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.128, "grad_norm": 2.7689688205718994, "kl": 0.1316375732421875, "learning_rate": 2.25e-06, "loss": -0.1339, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019224481657147408, "mask/share_reasoning": 0.756749153137207, "mask/share_step_conf": 0.2005889117717743, "num_tokens": 38206689.0, "reward": 0.985058069229126, "reward_std": 0.12203526496887207, "rewards/accuracy_reward_step": 0.71484375, "rewards/asymmetric_l2_reward": 0.9071336388587952, "rewards/final_brier_reward_step": 0.7247011661529541, "rewards/format_reward_step": 0.9765625, "step": 120 }, { "adv/mean_abs_final_conf": 0.5716239809989929, "adv/mean_abs_reasoning": 0.4545040726661682, "adv/mean_abs_step_conf": 0.7429869771003723, "adv/ratio_final_to_reasoning": 1.257687258214843, "adv/ratio_step_to_reasoning": 1.634720174765285, "adv/std_final_conf": 0.8058938384056091, "adv/std_reasoning": 0.7393964529037476, "adv/std_step_conf": 0.9307540655136108, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.875, "calib/ece": 0.2929435483870969, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004436964504284102, "calib/mean_conf": 0.9864919354838712, "calib/mu_c": 0.9866279069767441, "calib/mu_w": 0.9861842105263157, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2929435483870969, "calib/std_conf": 0.0058364759138209345, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3091131547381048, "calib/step_q_c_n": 2501.0, "calib/step_q_gap": -0.046648841423123644, "calib/step_q_w": 0.35576199616122844, "calib/step_q_w_n": 1563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 873.56640625, "completions/mean_terminated_length": 901.7459106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.12906666666666666, "grad_norm": 0.9633840918540955, "kl": 0.121826171875, "learning_rate": 2.222222222222222e-06, "loss": -0.1673, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018121790140867233, "mask/share_reasoning": 0.7583945989608765, "mask/share_step_conf": 0.1922336220741272, "num_tokens": 38535378.0, "reward": 0.9467383623123169, "reward_std": 0.1940290331840515, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8854833841323853, "rewards/final_brier_reward_step": 0.6798683404922485, "rewards/format_reward_step": 0.96875, "step": 121 }, { "adv/mean_abs_final_conf": 0.3817138075828552, "adv/mean_abs_reasoning": 0.33455291390419006, "adv/mean_abs_step_conf": 0.7632284164428711, "adv/ratio_final_to_reasoning": 1.1409669194875738, "adv/ratio_step_to_reasoning": 2.2813384212861645, "adv/std_final_conf": 0.6592438220977783, "adv/std_reasoning": 0.6185694336891174, "adv/std_step_conf": 0.9289442896842957, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.5703125, "calib/ece": 0.27643724696356287, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0025328104993598277, "calib/mean_conf": 0.9889878542510122, "calib/mu_c": 0.9897159090909091, "calib/mu_w": 0.9871830985915493, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27643724696356287, "calib/std_conf": 0.0030161264019739585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30713074957410563, "calib/step_q_c_n": 2348.0, "calib/step_q_gap": -0.013180605737249718, "calib/step_q_w": 0.32031135531135535, "calib/step_q_w_n": 1638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 826.1796875, "completions/mean_terminated_length": 859.7642211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.13013333333333332, "grad_norm": 0.5878250002861023, "kl": 0.1271820068359375, "learning_rate": 2.1944444444444445e-06, "loss": -0.1411, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017407502979040146, "mask/share_reasoning": 0.7506500482559204, "mask/share_step_conf": 0.19287997484207153, "num_tokens": 38854224.0, "reward": 0.955092191696167, "reward_std": 0.13091230392456055, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.8852320313453674, "rewards/final_brier_reward_step": 0.6944836378097534, "rewards/format_reward_step": 0.96484375, "step": 122 }, { "adv/mean_abs_final_conf": 0.48798710107803345, "adv/mean_abs_reasoning": 0.4532654285430908, "adv/mean_abs_step_conf": 0.7436596155166626, "adv/ratio_final_to_reasoning": 1.0766033991309394, "adv/ratio_step_to_reasoning": 1.6406713785937124, "adv/std_final_conf": 0.7481557130813599, "adv/std_reasoning": 0.7392951250076294, "adv/std_step_conf": 0.9294713735580444, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 16.01171875, "calib/ece": 0.3434274193548388, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006306818181819329, "calib/mean_conf": 0.9885887096774194, "calib/mu_c": 0.9888125000000002, "calib/mu_w": 0.9881818181818183, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3434274193548388, "calib/std_conf": 0.0034815460432395104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33831505693800085, "calib/step_q_c_n": 2371.0, "calib/step_q_gap": 0.014689478234297115, "calib/step_q_w": 0.32362557870370373, "calib/step_q_w_n": 1728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 954.25, "completions/mean_terminated_length": 981.0762939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.1312, "grad_norm": 0.424822062253952, "kl": 0.1156768798828125, "learning_rate": 2.166666666666667e-06, "loss": -0.1854, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0167711041867733, "mask/share_reasoning": 0.7686349153518677, "mask/share_step_conf": 0.1872502863407135, "num_tokens": 39203800.0, "reward": 0.9190748929977417, "reward_std": 0.18219560384750366, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.8864126801490784, "rewards/final_brier_reward_step": 0.6329870820045471, "rewards/format_reward_step": 0.96875, "step": 123 }, { "adv/mean_abs_final_conf": 0.4092365503311157, "adv/mean_abs_reasoning": 0.3890310525894165, "adv/mean_abs_step_conf": 0.7741281390190125, "adv/ratio_final_to_reasoning": 1.0519380075374705, "adv/ratio_step_to_reasoning": 1.9898877836778432, "adv/std_final_conf": 0.6815820932388306, "adv/std_reasoning": 0.6612483859062195, "adv/std_step_conf": 0.9297136664390564, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.859375, "calib/ece": 0.264183266932271, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.0005892658066569689, "calib/mean_conf": 0.9892828685258965, "calib/mu_c": 0.989120879120879, "calib/mu_w": 0.989710144927536, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.264183266932271, "calib/std_conf": 0.005944938700611581, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33208949416342415, "calib/step_q_c_n": 2570.0, "calib/step_q_gap": -0.018117150893301903, "calib/step_q_w": 0.35020664505672605, "calib/step_q_w_n": 1234.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 878.37109375, "completions/mean_terminated_length": 895.8685302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.13226666666666667, "grad_norm": 0.3895443081855774, "kl": 0.1200103759765625, "learning_rate": 2.138888888888889e-06, "loss": -0.0976, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017488375306129456, "mask/share_reasoning": 0.7753483057022095, "mask/share_step_conf": 0.18763208389282227, "num_tokens": 39535479.0, "reward": 0.9722012281417847, "reward_std": 0.14500567317008972, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.8897839784622192, "rewards/final_brier_reward_step": 0.7163370847702026, "rewards/format_reward_step": 0.98046875, "step": 124 }, { "adv/mean_abs_final_conf": 0.5757139325141907, "adv/mean_abs_reasoning": 0.5143795013427734, "adv/mean_abs_step_conf": 0.7276121377944946, "adv/ratio_final_to_reasoning": 1.1192396489582213, "adv/ratio_step_to_reasoning": 1.4145434176421947, "adv/std_final_conf": 0.7775260210037231, "adv/std_reasoning": 0.7394909858703613, "adv/std_step_conf": 0.9310935735702515, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.78125, "calib/ece": 0.3904800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.008214499302279132, "calib/mean_conf": 0.9864800000000001, "calib/mu_c": 0.9897986577181206, "calib/mu_w": 0.9815841584158415, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3904800000000001, "calib/std_conf": 0.04392276858304814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3889052132701422, "calib/step_q_c_n": 2110.0, "calib/step_q_gap": -0.022095384101422866, "calib/step_q_w": 0.4110005973715651, "calib/step_q_w_n": 1674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 896.08984375, "completions/mean_terminated_length": 910.3135375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.13333333333333333, "grad_norm": 1.2234833240509033, "kl": 0.126495361328125, "learning_rate": 2.1111111111111114e-06, "loss": -0.0928, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01819682866334915, "mask/share_reasoning": 0.7786560654640198, "mask/share_step_conf": 0.18752211332321167, "num_tokens": 39869686.0, "reward": 0.8933454155921936, "reward_std": 0.21132420003414154, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8796907663345337, "rewards/final_brier_reward_step": 0.594499945640564, "rewards/format_reward_step": 0.9765625, "step": 125 }, { "adv/mean_abs_final_conf": 0.7509729862213135, "adv/mean_abs_reasoning": 0.5741642713546753, "adv/mean_abs_step_conf": 0.7404218316078186, "adv/ratio_final_to_reasoning": 1.30794099125931, "adv/ratio_step_to_reasoning": 1.2895644479251165, "adv/std_final_conf": 0.9004964232444763, "adv/std_reasoning": 0.8100572228431702, "adv/std_step_conf": 0.9327021837234497, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.2890625, "calib/ece": 0.31004291845493565, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.8154506437768241, "calib/gap": -0.028273393273393177, "calib/mean_conf": 0.9563948497854078, "calib/mu_c": 0.9470512820512821, "calib/mu_w": 0.9753246753246753, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2984549356223176, "calib/std_conf": 0.1042687941462006, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.435239121068505, "calib/step_q_c_n": 2321.0, "calib/step_q_gap": 0.04682104197245979, "calib/step_q_w": 0.3884180790960452, "calib/step_q_w_n": 1593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 907.53515625, "completions/mean_terminated_length": 929.3160400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.1344, "grad_norm": 1.8819005489349365, "kl": 0.116241455078125, "learning_rate": 2.0833333333333334e-06, "loss": -0.1394, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.01702333427965641, "mask/share_reasoning": 0.7705803513526917, "mask/share_step_conf": 0.18895882368087769, "num_tokens": 40207479.0, "reward": 0.8807992935180664, "reward_std": 0.27574867010116577, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.8365055322647095, "rewards/final_brier_reward_step": 0.6125929951667786, "rewards/format_reward_step": 0.91015625, "step": 126 }, { "adv/mean_abs_final_conf": 0.7520188093185425, "adv/mean_abs_reasoning": 0.34918344020843506, "adv/mean_abs_step_conf": 0.7347904443740845, "adv/ratio_final_to_reasoning": 2.1536496944690344, "adv/ratio_step_to_reasoning": 2.1043106853391227, "adv/std_final_conf": 0.9135465621948242, "adv/std_reasoning": 0.6186864376068115, "adv/std_step_conf": 0.931441068649292, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.375, "calib/ece": 0.3068421052631578, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8663967611336032, "calib/gap": 0.010606193397604602, "calib/mean_conf": 0.9667611336032387, "calib/mu_c": 0.9703680981595091, "calib/mu_w": 0.9597619047619045, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3068421052631578, "calib/std_conf": 0.06429338456730833, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4392610837438424, "calib/step_q_c_n": 2233.0, "calib/step_q_gap": -0.007802920953750125, "calib/step_q_w": 0.4470640046975925, "calib/step_q_w_n": 1703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 859.27734375, "completions/mean_terminated_length": 879.9000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.13546666666666668, "grad_norm": 2.104114532470703, "kl": 0.1326751708984375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0213, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018442781642079353, "mask/share_reasoning": 0.7617242932319641, "mask/share_step_conf": 0.19639545679092407, "num_tokens": 40531126.0, "reward": 0.9300879240036011, "reward_std": 0.15767735242843628, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.8817926645278931, "rewards/final_brier_reward_step": 0.6580707430839539, "rewards/format_reward_step": 0.96484375, "step": 127 }, { "adv/mean_abs_final_conf": 0.503220796585083, "adv/mean_abs_reasoning": 0.3909847140312195, "adv/mean_abs_step_conf": 0.7482200860977173, "adv/ratio_final_to_reasoning": 1.287060026967964, "adv/ratio_step_to_reasoning": 1.9136811728091578, "adv/std_final_conf": 0.7600083351135254, "adv/std_reasoning": 0.681631326675415, "adv/std_step_conf": 0.931346595287323, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 14.828125, "calib/ece": 0.338089430894309, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": -0.00016715116279031328, "calib/mean_conf": 0.9884959349593496, "calib/mu_c": 0.9884375000000002, "calib/mu_w": 0.9886046511627905, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.338089430894309, "calib/std_conf": 0.010069574218350011, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4379797742238946, "calib/step_q_c_n": 2126.0, "calib/step_q_gap": 0.031312109553235934, "calib/step_q_w": 0.4066676646706587, "calib/step_q_w_n": 1670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 863.765625, "completions/mean_terminated_length": 895.2388916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.13653333333333334, "grad_norm": 0.8047178387641907, "kl": 0.124481201171875, "learning_rate": 2.027777777777778e-06, "loss": -0.2159, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018535666167736053, "mask/share_reasoning": 0.7565313577651978, "mask/share_step_conf": 0.18977676331996918, "num_tokens": 40858914.0, "reward": 0.9114823341369629, "reward_std": 0.15939074754714966, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.8742043972015381, "rewards/final_brier_reward_step": 0.6323539018630981, "rewards/format_reward_step": 0.95703125, "step": 128 }, { "adv/mean_abs_final_conf": 0.41117343306541443, "adv/mean_abs_reasoning": 0.3632163405418396, "adv/mean_abs_step_conf": 0.7497574687004089, "adv/ratio_final_to_reasoning": 1.132034512687489, "adv/ratio_step_to_reasoning": 2.064217341053363, "adv/std_final_conf": 0.6818158626556396, "adv/std_reasoning": 0.6402585506439209, "adv/std_step_conf": 0.9302917718887329, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.17578125, "calib/ece": 0.2916862745098039, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005369910987889082, "calib/mean_conf": 0.9897254901960784, "calib/mu_c": 0.9898876404494381, "calib/mu_w": 0.9893506493506492, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2916862745098039, "calib/std_conf": 0.004550709625216838, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4760640374853573, "calib/step_q_c_n": 2561.0, "calib/step_q_gap": 0.05347040452655577, "calib/step_q_w": 0.42259363295880153, "calib/step_q_w_n": 1068.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 823.2265625, "completions/mean_terminated_length": 829.7086791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.1376, "grad_norm": 0.9057077169418335, "kl": 0.1287689208984375, "learning_rate": 2.0000000000000003e-06, "loss": -0.0097, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020107831805944443, "mask/share_reasoning": 0.7813020944595337, "mask/share_step_conf": 0.1907775104045868, "num_tokens": 41172044.0, "reward": 0.9572560787200928, "reward_std": 0.14120006561279297, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8801839351654053, "rewards/final_brier_reward_step": 0.6976093649864197, "rewards/format_reward_step": 0.98828125, "step": 129 }, { "adv/mean_abs_final_conf": 0.268592894077301, "adv/mean_abs_reasoning": 0.2511371076107025, "adv/mean_abs_step_conf": 0.7308031320571899, "adv/ratio_final_to_reasoning": 1.069506998120156, "adv/ratio_step_to_reasoning": 2.909976701611283, "adv/std_final_conf": 0.5554208159446716, "adv/std_reasoning": 0.548273503780365, "adv/std_step_conf": 0.9301493763923645, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.16796875, "calib/ece": 0.26284738955823306, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000472213194670168, "calib/mean_conf": 0.9897550200803215, "calib/mu_c": 0.9898839779005524, "calib/mu_w": 0.9894117647058822, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26284738955823306, "calib/std_conf": 0.0025078641169959085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43612049799911073, "calib/step_q_c_n": 2249.0, "calib/step_q_gap": 0.043551557505641936, "calib/step_q_w": 0.3925689404934688, "calib/step_q_w_n": 1378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 754.734375, "completions/mean_terminated_length": 769.7689208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.13866666666666666, "grad_norm": 0.4536561369895935, "kl": 0.12738037109375, "learning_rate": 1.9722222222222224e-06, "loss": -0.1226, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02006208896636963, "mask/share_reasoning": 0.7578706741333008, "mask/share_step_conf": 0.2025359719991684, "num_tokens": 41470544.0, "reward": 0.9651122689247131, "reward_std": 0.10998076945543289, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.8809576630592346, "rewards/final_brier_reward_step": 0.712548017501831, "rewards/format_reward_step": 0.97265625, "step": 130 }, { "adv/mean_abs_final_conf": 0.523271381855011, "adv/mean_abs_reasoning": 0.4472423791885376, "adv/mean_abs_step_conf": 0.7617558836936951, "adv/ratio_final_to_reasoning": 1.1699950769522736, "adv/ratio_step_to_reasoning": 1.7032283145345055, "adv/std_final_conf": 0.7880796194076538, "adv/std_reasoning": 0.7205169796943665, "adv/std_step_conf": 0.9308968186378479, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.48828125, "calib/ece": 0.4696733870967744, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003445378151263334, "calib/mean_conf": 0.989834677419355, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9896554621848738, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4696733870967744, "calib/std_conf": 0.00176687916754823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4019420289855072, "calib/step_q_c_n": 1725.0, "calib/step_q_gap": 0.010111671842649994, "calib/step_q_w": 0.3918303571428572, "calib/step_q_w_n": 2240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 817.875, "completions/mean_terminated_length": 837.5040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.13973333333333332, "grad_norm": 0.7782612442970276, "kl": 0.122802734375, "learning_rate": 1.944444444444445e-06, "loss": -0.0807, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01834723725914955, "mask/share_reasoning": 0.7647630572319031, "mask/share_step_conf": 0.19345220923423767, "num_tokens": 41786128.0, "reward": 0.8445634245872498, "reward_std": 0.1692183017730713, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.8803940415382385, "rewards/final_brier_reward_step": 0.5134202837944031, "rewards/format_reward_step": 0.96875, "step": 131 }, { "adv/mean_abs_final_conf": 0.48913246393203735, "adv/mean_abs_reasoning": 0.4297143816947937, "adv/mean_abs_step_conf": 0.7331827878952026, "adv/ratio_final_to_reasoning": 1.1382734317685592, "adv/ratio_step_to_reasoning": 1.706209564137764, "adv/std_final_conf": 0.7304244041442871, "adv/std_reasoning": 0.7013852596282959, "adv/std_step_conf": 0.9297940731048584, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.3671875, "calib/ece": 0.2546640316205535, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.013282988871224144, "calib/mean_conf": 0.9858893280632413, "calib/mu_c": 0.9894594594594596, "calib/mu_w": 0.9761764705882354, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2546640316205535, "calib/std_conf": 0.055884016720748914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38789877904686887, "calib/step_q_c_n": 2539.0, "calib/step_q_gap": 0.02912792742263709, "calib/step_q_w": 0.3587708516242318, "calib/step_q_w_n": 1139.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 825.703125, "completions/mean_terminated_length": 835.4940795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.1408, "grad_norm": 0.3801063895225525, "kl": 0.1238555908203125, "learning_rate": 1.916666666666667e-06, "loss": -0.0466, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019597802311182022, "mask/share_reasoning": 0.7730059623718262, "mask/share_step_conf": 0.19567744433879852, "num_tokens": 42103100.0, "reward": 0.9892120957374573, "reward_std": 0.16496729850769043, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.9042073488235474, "rewards/final_brier_reward_step": 0.7320293188095093, "rewards/format_reward_step": 0.98828125, "step": 132 }, { "adv/mean_abs_final_conf": 0.6259187459945679, "adv/mean_abs_reasoning": 0.5151533484458923, "adv/mean_abs_step_conf": 0.733622670173645, "adv/ratio_final_to_reasoning": 1.2150144182947293, "adv/ratio_step_to_reasoning": 1.4240859976681273, "adv/std_final_conf": 0.8321493864059448, "adv/std_reasoning": 0.7754168510437012, "adv/std_step_conf": 0.9297448396682739, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 16.3828125, "calib/ece": 0.3794422310756972, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008770174736560943, "calib/mean_conf": 0.989003984063745, "calib/mu_c": 0.9893464052287582, "calib/mu_w": 0.9884693877551021, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3794422310756972, "calib/std_conf": 0.0034864541605034586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34513859722805545, "calib/step_q_c_n": 2381.0, "calib/step_q_gap": -0.0012480547300250944, "calib/step_q_w": 0.34638665195808055, "calib/step_q_w_n": 1813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 977.4296875, "completions/mean_terminated_length": 992.9445190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.14186666666666667, "grad_norm": 0.5285957455635071, "kl": 0.1024322509765625, "learning_rate": 1.888888888888889e-06, "loss": -0.066, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01599453017115593, "mask/share_reasoning": 0.7807644605636597, "mask/share_step_conf": 0.1876160204410553, "num_tokens": 42459666.0, "reward": 0.9146148562431335, "reward_std": 0.19941456615924835, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.9072507619857788, "rewards/final_brier_reward_step": 0.6063538789749146, "rewards/format_reward_step": 0.98046875, "step": 133 }, { "adv/mean_abs_final_conf": 0.5985839366912842, "adv/mean_abs_reasoning": 0.5509366989135742, "adv/mean_abs_step_conf": 0.7408708930015564, "adv/ratio_final_to_reasoning": 1.0864840513831597, "adv/ratio_step_to_reasoning": 1.3447477622429673, "adv/std_final_conf": 0.8353852033615112, "adv/std_reasoning": 0.8097612857818604, "adv/std_step_conf": 0.929698646068573, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 17.0546875, "calib/ece": 0.39272000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0009847830420623982, "calib/mean_conf": 0.98872, "calib/mu_c": 0.9883221476510067, "calib/mu_w": 0.9893069306930691, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39272000000000007, "calib/std_conf": 0.005362984243870202, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3290512174643157, "calib/step_q_c_n": 2382.0, "calib/step_q_gap": 0.0364504110127028, "calib/step_q_w": 0.2926008064516129, "calib/step_q_w_n": 1984.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 954.21484375, "completions/mean_terminated_length": 973.22314453125, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.14293333333333333, "grad_norm": 0.42407187819480896, "kl": 0.10980224609375, "learning_rate": 1.8611111111111113e-06, "loss": -0.1283, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016450483351945877, "mask/share_reasoning": 0.7754415273666382, "mask/share_step_conf": 0.18857669830322266, "num_tokens": 42812897.0, "reward": 0.9058350920677185, "reward_std": 0.1978456974029541, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.909635066986084, "rewards/final_brier_reward_step": 0.5903164148330688, "rewards/format_reward_step": 0.9765625, "step": 134 }, { "adv/mean_abs_final_conf": 0.5336993932723999, "adv/mean_abs_reasoning": 0.39589864015579224, "adv/mean_abs_step_conf": 0.7357176542282104, "adv/ratio_final_to_reasoning": 1.348070791711689, "adv/ratio_step_to_reasoning": 1.8583485256193206, "adv/std_final_conf": 0.7325779795646667, "adv/std_reasoning": 0.6614921689033508, "adv/std_step_conf": 0.9296439290046692, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.1875, "calib/ece": 0.3025000000000002, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.0007993966817498865, "calib/mean_conf": 0.9879838709677421, "calib/mu_c": 0.9882352941176471, "calib/mu_w": 0.9874358974358972, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3025000000000002, "calib/std_conf": 0.007068768053190196, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.26142096642096646, "calib/step_q_c_n": 2442.0, "calib/step_q_gap": -0.006504344782353011, "calib/step_q_w": 0.26792531120331947, "calib/step_q_w_n": 1446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 865.08984375, "completions/mean_terminated_length": 889.4096069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.144, "grad_norm": 1.1292216777801514, "kl": 0.125762939453125, "learning_rate": 1.8333333333333333e-06, "loss": -0.1078, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017884589731693268, "mask/share_reasoning": 0.7671705484390259, "mask/share_step_conf": 0.18760114908218384, "num_tokens": 43140240.0, "reward": 0.9437267780303955, "reward_std": 0.17460128664970398, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.8940476179122925, "rewards/final_brier_reward_step": 0.6676245927810669, "rewards/format_reward_step": 0.96484375, "step": 135 }, { "adv/mean_abs_final_conf": 0.6092220544815063, "adv/mean_abs_reasoning": 0.37975797057151794, "adv/mean_abs_step_conf": 0.7440617084503174, "adv/ratio_final_to_reasoning": 1.6042377031998978, "adv/ratio_step_to_reasoning": 1.9593050471871318, "adv/std_final_conf": 0.802588701248169, "adv/std_reasoning": 0.6613209843635559, "adv/std_step_conf": 0.9287387728691101, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 16.08984375, "calib/ece": 0.3816532258064518, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006108843537415698, "calib/mean_conf": 0.9864919354838712, "calib/mu_c": 0.9867333333333332, "calib/mu_w": 0.9861224489795917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3816532258064518, "calib/std_conf": 0.0073638129671517736, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22906190476190474, "calib/step_q_c_n": 2100.0, "calib/step_q_gap": -0.019957411731408775, "calib/step_q_w": 0.24901931649331352, "calib/step_q_w_n": 2019.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 816.3515625, "completions/mean_terminated_length": 842.6854858398438, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.14506666666666668, "grad_norm": 0.5741930603981018, "kl": 0.1309967041015625, "learning_rate": 1.8055555555555557e-06, "loss": -0.1263, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018834035843610764, "mask/share_reasoning": 0.7524459362030029, "mask/share_step_conf": 0.19746999442577362, "num_tokens": 43457714.0, "reward": 0.9019168615341187, "reward_std": 0.15052568912506104, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8965630531311035, "rewards/final_brier_reward_step": 0.5963331460952759, "rewards/format_reward_step": 0.96875, "step": 136 }, { "adv/mean_abs_final_conf": 0.6180294752120972, "adv/mean_abs_reasoning": 0.35315749049186707, "adv/mean_abs_step_conf": 0.7460911273956299, "adv/ratio_final_to_reasoning": 1.7500109493680127, "adv/ratio_step_to_reasoning": 2.1126300516987384, "adv/std_final_conf": 0.7892508506774902, "adv/std_reasoning": 0.618638277053833, "adv/std_step_conf": 0.9266929626464844, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 17.23828125, "calib/ece": 0.3343902439024391, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.000581395348837499, "calib/mean_conf": 0.9847967479674797, "calib/mu_c": 0.9850000000000001, "calib/mu_w": 0.9844186046511626, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3343902439024391, "calib/std_conf": 0.011814941174324136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2536361175560712, "calib/step_q_c_n": 2586.0, "calib/step_q_gap": -0.020129618623458134, "calib/step_q_w": 0.2737657361795293, "calib/step_q_w_n": 1827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 888.65234375, "completions/mean_terminated_length": 924.7763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.14613333333333334, "grad_norm": 0.5475178360939026, "kl": 0.110626220703125, "learning_rate": 1.777777777777778e-06, "loss": -0.154, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016782626509666443, "mask/share_reasoning": 0.7560588717460632, "mask/share_step_conf": 0.18809598684310913, "num_tokens": 43792193.0, "reward": 0.9182146787643433, "reward_std": 0.1334601640701294, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.8841294050216675, "rewards/final_brier_reward_step": 0.635112464427948, "rewards/format_reward_step": 0.9609375, "step": 137 }, { "adv/mean_abs_final_conf": 0.6299825310707092, "adv/mean_abs_reasoning": 0.3710201680660248, "adv/mean_abs_step_conf": 0.7547677755355835, "adv/ratio_final_to_reasoning": 1.697973817311734, "adv/ratio_step_to_reasoning": 2.0343039017794555, "adv/std_final_conf": 0.8220841884613037, "adv/std_reasoning": 0.6613304615020752, "adv/std_step_conf": 0.9281949400901794, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.64453125, "calib/ece": 0.2651004016064258, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": -0.0022194732641657167, "calib/mean_conf": 0.9839759036144579, "calib/mu_c": 0.9833519553072626, "calib/mu_w": 0.9855714285714283, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2651004016064258, "calib/std_conf": 0.011329999879781574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21672243346007605, "calib/step_q_c_n": 2630.0, "calib/step_q_gap": -0.015786657449014835, "calib/step_q_w": 0.2325090909090909, "calib/step_q_w_n": 1375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 835.83203125, "completions/mean_terminated_length": 859.3292846679688, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.1472, "grad_norm": 0.5974453687667847, "kl": 0.1317901611328125, "learning_rate": 1.75e-06, "loss": -0.1189, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019664429128170013, "mask/share_reasoning": 0.7604337334632874, "mask/share_step_conf": 0.19255805015563965, "num_tokens": 44110502.0, "reward": 0.9590154886245728, "reward_std": 0.15523535013198853, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8769211769104004, "rewards/final_brier_reward_step": 0.7067347764968872, "rewards/format_reward_step": 0.97265625, "step": 138 }, { "adv/mean_abs_final_conf": 0.5524908304214478, "adv/mean_abs_reasoning": 0.3340801000595093, "adv/mean_abs_step_conf": 0.7541294097900391, "adv/ratio_final_to_reasoning": 1.6537675555144806, "adv/ratio_step_to_reasoning": 2.2573311300365, "adv/std_final_conf": 0.7784129977226257, "adv/std_reasoning": 0.6401391625404358, "adv/std_step_conf": 0.9281803965568542, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.7265625, "calib/ece": 0.27868525896414365, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880478087649402, "calib/gap": -0.0007834385100816688, "calib/mean_conf": 0.9806772908366536, "calib/mu_c": 0.9804494382022473, "calib/mu_w": 0.981232876712329, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27509960159362573, "calib/std_conf": 0.05707284995008901, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21612272120942638, "calib/step_q_c_n": 2249.0, "calib/step_q_gap": -0.0042488202925498875, "calib/step_q_w": 0.22037154150197627, "calib/step_q_w_n": 1265.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 759.9296875, "completions/mean_terminated_length": 775.0677490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.14826666666666666, "grad_norm": 0.6480685472488403, "kl": 0.1359100341796875, "learning_rate": 1.7222222222222224e-06, "loss": -0.1143, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0201815627515316, "mask/share_reasoning": 0.7686899304389954, "mask/share_step_conf": 0.19159728288650513, "num_tokens": 44408140.0, "reward": 0.9629392623901367, "reward_std": 0.1242162361741066, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8882663249969482, "rewards/final_brier_reward_step": 0.7024558186531067, "rewards/format_reward_step": 0.98046875, "step": 139 }, { "adv/mean_abs_final_conf": 0.5907405614852905, "adv/mean_abs_reasoning": 0.30170968174934387, "adv/mean_abs_step_conf": 0.7475422620773315, "adv/ratio_final_to_reasoning": 1.9579768142013734, "adv/ratio_step_to_reasoning": 2.4776873507770927, "adv/std_final_conf": 0.7750344276428223, "adv/std_reasoning": 0.5727941989898682, "adv/std_step_conf": 0.9279723763465881, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.87109375, "calib/ece": 0.31264822134387354, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": 0.013956748379825412, "calib/mean_conf": 0.9806324110671938, "calib/mu_c": 0.9852662721893493, "calib/mu_w": 0.9713095238095238, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31264822134387354, "calib/std_conf": 0.05689247934867771, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26037914691943126, "calib/step_q_c_n": 2321.0, "calib/step_q_gap": 0.014282242477977658, "calib/step_q_w": 0.2460969044414536, "calib/step_q_w_n": 1486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 856.53515625, "completions/mean_terminated_length": 863.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.14933333333333335, "grad_norm": 1.0277642011642456, "kl": 0.125885009765625, "learning_rate": 1.6944444444444446e-06, "loss": -0.0211, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018999816849827766, "mask/share_reasoning": 0.7788618803024292, "mask/share_step_conf": 0.19432583451271057, "num_tokens": 44732429.0, "reward": 0.954172670841217, "reward_std": 0.12704716622829437, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.9032429456710815, "rewards/final_brier_reward_step": 0.6754148602485657, "rewards/format_reward_step": 0.98828125, "step": 140 }, { "adv/mean_abs_final_conf": 0.6153268814086914, "adv/mean_abs_reasoning": 0.3498820662498474, "adv/mean_abs_step_conf": 0.7341009378433228, "adv/ratio_final_to_reasoning": 1.7586693939588558, "adv/ratio_step_to_reasoning": 2.098138226150489, "adv/std_final_conf": 0.8021078109741211, "adv/std_reasoning": 0.6404186487197876, "adv/std_step_conf": 0.931639552116394, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 15.58984375, "calib/ece": 0.2560975609756097, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.983739837398374, "calib/gap": -0.006893327666808391, "calib/mean_conf": 0.9772357723577235, "calib/mu_c": 0.975414364640884, "calib/mu_w": 0.9823076923076924, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24878048780487802, "calib/std_conf": 0.08074149065898127, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29297265625, "calib/step_q_c_n": 2560.0, "calib/step_q_gap": 0.023681950449860267, "calib/step_q_w": 0.26929070580013975, "calib/step_q_w_n": 1431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 845.6640625, "completions/mean_terminated_length": 876.477783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.1504, "grad_norm": 1.5194544792175293, "kl": 0.11370849609375, "learning_rate": 1.6666666666666667e-06, "loss": -0.2224, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01793743669986725, "mask/share_reasoning": 0.7622042894363403, "mask/share_step_conf": 0.18470202386379242, "num_tokens": 45056015.0, "reward": 0.9502112865447998, "reward_std": 0.16413474082946777, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.8598936200141907, "rewards/final_brier_reward_step": 0.7084976434707642, "rewards/format_reward_step": 0.94921875, "step": 141 }, { "adv/mean_abs_final_conf": 0.5537828207015991, "adv/mean_abs_reasoning": 0.32727572321891785, "adv/mean_abs_step_conf": 0.7571170330047607, "adv/ratio_final_to_reasoning": 1.6920986844208072, "adv/ratio_step_to_reasoning": 2.3133919789654485, "adv/std_final_conf": 0.7993838787078857, "adv/std_reasoning": 0.618484616279602, "adv/std_step_conf": 0.9275542497634888, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.56640625, "calib/ece": 0.41956692913385824, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9803149606299213, "calib/gap": 0.009506079506079401, "calib/mean_conf": 0.9825590551181104, "calib/mu_c": 0.9867132867132865, "calib/mu_w": 0.9772072072072071, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41956692913385824, "calib/std_conf": 0.051590323444643635, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32506625891946994, "calib/step_q_c_n": 1962.0, "calib/step_q_gap": 0.03993892445427466, "calib/step_q_w": 0.2851273344651953, "calib/step_q_w_n": 1767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 882.69921875, "completions/mean_terminated_length": 889.6495971679688, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.15146666666666667, "grad_norm": 1.048315405845642, "kl": 0.1196136474609375, "learning_rate": 1.638888888888889e-06, "loss": 0.0021, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01925894245505333, "mask/share_reasoning": 0.7851055860519409, "mask/share_step_conf": 0.18782293796539307, "num_tokens": 45387146.0, "reward": 0.9025686979293823, "reward_std": 0.12361149489879608, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.9195643663406372, "rewards/final_brier_reward_step": 0.5754168033599854, "rewards/format_reward_step": 0.9921875, "step": 142 }, { "adv/mean_abs_final_conf": 0.5344308614730835, "adv/mean_abs_reasoning": 0.34488245844841003, "adv/mean_abs_step_conf": 0.7593881487846375, "adv/ratio_final_to_reasoning": 1.5496029107349554, "adv/ratio_step_to_reasoning": 2.2018752481672887, "adv/std_final_conf": 0.743278443813324, "adv/std_reasoning": 0.6185718178749084, "adv/std_step_conf": 0.9269011616706848, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.80078125, "calib/ece": 0.29174603174603175, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": 0.008374025974026367, "calib/mean_conf": 0.97984126984127, "calib/mu_c": 0.9824, "calib/mu_w": 0.9740259740259737, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2885714285714286, "calib/std_conf": 0.08011880193677592, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3766723116003387, "calib/step_q_c_n": 2362.0, "calib/step_q_gap": 0.027022876068550195, "calib/step_q_w": 0.3496494355317885, "calib/step_q_w_n": 1683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 875.7265625, "completions/mean_terminated_length": 889.6270141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.15253333333333333, "grad_norm": 0.4601934552192688, "kl": 0.1195220947265625, "learning_rate": 1.6111111111111113e-06, "loss": -0.0885, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01835741475224495, "mask/share_reasoning": 0.7799018621444702, "mask/share_step_conf": 0.18611572682857513, "num_tokens": 45718668.0, "reward": 0.9626208543777466, "reward_std": 0.13852755725383759, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.8991478681564331, "rewards/final_brier_reward_step": 0.6924999952316284, "rewards/format_reward_step": 0.984375, "step": 143 }, { "adv/mean_abs_final_conf": 0.5253238677978516, "adv/mean_abs_reasoning": 0.3445174992084503, "adv/mean_abs_step_conf": 0.7489527463912964, "adv/ratio_final_to_reasoning": 1.5248104058714427, "adv/ratio_step_to_reasoning": 2.173917865165225, "adv/std_final_conf": 0.796026349067688, "adv/std_reasoning": 0.6610748171806335, "adv/std_step_conf": 0.9308090806007385, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 14.546875, "calib/ece": 0.24505882352941175, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": -0.006634424603174649, "calib/mean_conf": 0.969607843137255, "calib/mu_c": 0.9679687500000002, "calib/mu_w": 0.9746031746031748, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2308627450980392, "calib/std_conf": 0.12340211305772951, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39171739130434785, "calib/step_q_c_n": 2760.0, "calib/step_q_gap": 0.03780660292260518, "calib/step_q_w": 0.35391078838174267, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2100.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 812.71875, "completions/mean_terminated_length": 819.1181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.1536, "grad_norm": 1.0905005931854248, "kl": 0.12884521484375, "learning_rate": 1.5833333333333333e-06, "loss": 0.0366, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020532600581645966, "mask/share_reasoning": 0.7764140963554382, "mask/share_step_conf": 0.1952407956123352, "num_tokens": 46030852.0, "reward": 1.0011918544769287, "reward_std": 0.13063669204711914, "rewards/accuracy_reward_step": 0.75, "rewards/asymmetric_l2_reward": 0.906753659248352, "rewards/final_brier_reward_step": 0.7464112639427185, "rewards/format_reward_step": 0.99609375, "step": 144 }, { "adv/mean_abs_final_conf": 0.4794083535671234, "adv/mean_abs_reasoning": 0.36883217096328735, "adv/mean_abs_step_conf": 0.7466808557510376, "adv/ratio_final_to_reasoning": 1.2998008072751401, "adv/ratio_step_to_reasoning": 2.024446115426738, "adv/std_final_conf": 0.7391177415847778, "adv/std_reasoning": 0.6815920472145081, "adv/std_step_conf": 0.9290433526039124, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.44921875, "calib/ece": 0.2816269841269842, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": -0.003268539067880716, "calib/mean_conf": 0.9848015873015874, "calib/mu_c": 0.983854748603352, "calib/mu_w": 0.9871232876712327, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27805555555555567, "calib/std_conf": 0.056071456655313746, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45672465904902326, "calib/step_q_c_n": 2713.0, "calib/step_q_gap": 0.03205477177044036, "calib/step_q_w": 0.4246698872785829, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 840.72265625, "completions/mean_terminated_length": 854.0675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.15466666666666667, "grad_norm": 0.48902198672294617, "kl": 0.119293212890625, "learning_rate": 1.5555555555555558e-06, "loss": -0.0902, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01924433559179306, "mask/share_reasoning": 0.7702668905258179, "mask/share_step_conf": 0.19486378133296967, "num_tokens": 46348781.0, "reward": 0.9674624800682068, "reward_std": 0.15784257650375366, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8958249092102051, "rewards/final_brier_reward_step": 0.703162431716919, "rewards/format_reward_step": 0.98046875, "step": 145 }, { "adv/mean_abs_final_conf": 0.5155750513076782, "adv/mean_abs_reasoning": 0.3618355989456177, "adv/mean_abs_step_conf": 0.7516404986381531, "adv/ratio_final_to_reasoning": 1.4248875810176072, "adv/ratio_step_to_reasoning": 2.0772983665189932, "adv/std_final_conf": 0.7500534057617188, "adv/std_reasoning": 0.6612176299095154, "adv/std_step_conf": 0.9279693365097046, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 16.1328125, "calib/ece": 0.39862903225806445, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.00030889067955941485, "calib/mean_conf": 0.9873387096774194, "calib/mu_c": 0.9874657534246575, "calib/mu_w": 0.9871568627450981, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39862903225806445, "calib/std_conf": 0.008043116769469763, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39041342412451363, "calib/step_q_c_n": 2056.0, "calib/step_q_gap": 0.026949103970222377, "calib/step_q_w": 0.36346432015429125, "calib/step_q_w_n": 2074.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 840.3125, "completions/mean_terminated_length": 867.4193115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 251.0, "epoch": 0.15573333333333333, "grad_norm": 0.5047119855880737, "kl": 0.1184234619140625, "learning_rate": 1.527777777777778e-06, "loss": -0.1387, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01885879971086979, "mask/share_reasoning": 0.7595796585083008, "mask/share_step_conf": 0.19031155109405518, "num_tokens": 46671117.0, "reward": 0.8923627734184265, "reward_std": 0.14321592450141907, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.896584153175354, "rewards/final_brier_reward_step": 0.5803288817405701, "rewards/format_reward_step": 0.96875, "step": 146 }, { "adv/mean_abs_final_conf": 0.4925052523612976, "adv/mean_abs_reasoning": 0.3871638774871826, "adv/mean_abs_step_conf": 0.7252216935157776, "adv/ratio_final_to_reasoning": 1.2720847191577225, "adv/ratio_step_to_reasoning": 1.8731646614934696, "adv/std_final_conf": 0.7602009177207947, "adv/std_reasoning": 0.6815853714942932, "adv/std_step_conf": 0.92861407995224, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.578125, "calib/ece": 0.35900398406374523, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00015720702327493008, "calib/mean_conf": 0.9884860557768926, "calib/mu_c": 0.9885443037974684, "calib/mu_w": 0.9883870967741935, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35900398406374523, "calib/std_conf": 0.0054387942459224025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4720518266779949, "calib/step_q_c_n": 2354.0, "calib/step_q_gap": 0.0803566002398064, "calib/step_q_w": 0.3916952264381885, "calib/step_q_w_n": 1634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1999.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 823.375, "completions/mean_terminated_length": 839.7769165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.1568, "grad_norm": 0.7529616951942444, "kl": 0.125701904296875, "learning_rate": 1.5e-06, "loss": -0.0952, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019497964531183243, "mask/share_reasoning": 0.7687634229660034, "mask/share_step_conf": 0.19220739603042603, "num_tokens": 46985581.0, "reward": 0.9207668304443359, "reward_std": 0.15520966053009033, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8965362906455994, "rewards/final_brier_reward_step": 0.625465989112854, "rewards/format_reward_step": 0.98046875, "step": 147 }, { "adv/mean_abs_final_conf": 0.43908268213272095, "adv/mean_abs_reasoning": 0.31588056683540344, "adv/mean_abs_step_conf": 0.7488313913345337, "adv/ratio_final_to_reasoning": 1.3900275237935567, "adv/ratio_step_to_reasoning": 2.3706155742234336, "adv/std_final_conf": 0.6820304989814758, "adv/std_reasoning": 0.6185115575790405, "adv/std_step_conf": 0.9277695417404175, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.48046875, "calib/ece": 0.22082677165354336, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00013124728378988149, "calib/mean_conf": 0.9885433070866142, "calib/mu_c": 0.9885128205128204, "calib/mu_w": 0.9886440677966103, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22082677165354336, "calib/std_conf": 0.004237544797824781, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4459600903614458, "calib/step_q_c_n": 2656.0, "calib/step_q_gap": 0.07615990006648865, "calib/step_q_w": 0.36980019029495714, "calib/step_q_w_n": 1051.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 787.65625, "completions/mean_terminated_length": 796.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.15786666666666666, "grad_norm": 0.8388846516609192, "kl": 0.1266632080078125, "learning_rate": 1.4722222222222225e-06, "loss": -0.0294, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021315710619091988, "mask/share_reasoning": 0.7700055837631226, "mask/share_step_conf": 0.1969599425792694, "num_tokens": 47292333.0, "reward": 1.0063964128494263, "reward_std": 0.12641800940036774, "rewards/accuracy_reward_step": 0.76171875, "rewards/asymmetric_l2_reward": 0.8952064514160156, "rewards/final_brier_reward_step": 0.7668050527572632, "rewards/format_reward_step": 0.9921875, "step": 148 }, { "adv/mean_abs_final_conf": 0.5059627890586853, "adv/mean_abs_reasoning": 0.3728574216365814, "adv/mean_abs_step_conf": 0.7347331643104553, "adv/ratio_final_to_reasoning": 1.356987308547769, "adv/ratio_step_to_reasoning": 1.9705472431941795, "adv/std_final_conf": 0.7392430901527405, "adv/std_reasoning": 0.6612235307693481, "adv/std_step_conf": 0.9278441667556763, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 16.3046875, "calib/ece": 0.28489959839357437, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007806949806946628, "calib/mean_conf": 0.987710843373494, "calib/mu_c": 0.987942857142857, "calib/mu_w": 0.9871621621621623, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28489959839357437, "calib/std_conf": 0.00633571527483214, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44185981680605335, "calib/step_q_c_n": 2511.0, "calib/step_q_gap": 0.038519468038765325, "calib/step_q_w": 0.403340348767288, "calib/step_q_w_n": 1663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 881.9609375, "completions/mean_terminated_length": 906.7550048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.15893333333333334, "grad_norm": 0.7782192826271057, "kl": 0.11077880859375, "learning_rate": 1.4444444444444445e-06, "loss": -0.1087, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018333647400140762, "mask/share_reasoning": 0.7683661580085754, "mask/share_step_conf": 0.1859564483165741, "num_tokens": 47622571.0, "reward": 0.9591616988182068, "reward_std": 0.1404280960559845, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.896243691444397, "rewards/final_brier_reward_step": 0.6908296942710876, "rewards/format_reward_step": 0.97265625, "step": 149 }, { "adv/mean_abs_final_conf": 0.39306920766830444, "adv/mean_abs_reasoning": 0.2801649570465088, "adv/mean_abs_step_conf": 0.7405670881271362, "adv/ratio_final_to_reasoning": 1.4029920508689921, "adv/ratio_step_to_reasoning": 2.643325189324796, "adv/std_final_conf": 0.6572929620742798, "adv/std_reasoning": 0.5959498882293701, "adv/std_step_conf": 0.9278497099876404, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 13.44140625, "calib/ece": 0.2879918032786887, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003276455980133486, "calib/mean_conf": 0.9888114754098362, "calib/mu_c": 0.9887134502923977, "calib/mu_w": 0.989041095890411, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2879918032786887, "calib/std_conf": 0.004320807391181704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3801802240623478, "calib/step_q_c_n": 2053.0, "calib/step_q_gap": 0.02972633357243426, "calib/step_q_w": 0.35045389048991354, "calib/step_q_w_n": 1388.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 660.57421875, "completions/mean_terminated_length": 695.91357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.16, "grad_norm": 0.4880688190460205, "kl": 0.148956298828125, "learning_rate": 1.4166666666666667e-06, "loss": -0.1749, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02294372022151947, "mask/share_reasoning": 0.7396095395088196, "mask/share_step_conf": 0.18666550517082214, "num_tokens": 47896638.0, "reward": 0.934480607509613, "reward_std": 0.11018224060535431, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8715198636054993, "rewards/final_brier_reward_step": 0.6740038990974426, "rewards/format_reward_step": 0.94921875, "step": 150 }, { "adv/mean_abs_final_conf": 0.5228256583213806, "adv/mean_abs_reasoning": 0.38405531644821167, "adv/mean_abs_step_conf": 0.7402350902557373, "adv/ratio_final_to_reasoning": 1.3613290480041607, "adv/ratio_step_to_reasoning": 1.9274178967277884, "adv/std_final_conf": 0.7731865644454956, "adv/std_reasoning": 0.6816514134407043, "adv/std_step_conf": 0.9285038709640503, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.90625, "calib/ece": 0.36902834008097174, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0011229314420807546, "calib/mean_conf": 0.9884615384615385, "calib/mu_c": 0.9888888888888889, "calib/mu_w": 0.9877659574468082, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36902834008097174, "calib/std_conf": 0.006974616967523194, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3618729096989966, "calib/step_q_c_n": 2392.0, "calib/step_q_gap": 0.046783623984710865, "calib/step_q_w": 0.31508928571428574, "calib/step_q_w_n": 1680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 850.734375, "completions/mean_terminated_length": 878.1773681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.16106666666666666, "grad_norm": 1.097434401512146, "kl": 0.1197662353515625, "learning_rate": 1.3888888888888892e-06, "loss": -0.1488, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018286636099219322, "mask/share_reasoning": 0.7616772651672363, "mask/share_step_conf": 0.18878605961799622, "num_tokens": 48221450.0, "reward": 0.9045898914337158, "reward_std": 0.16505730152130127, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.8902146220207214, "rewards/final_brier_reward_step": 0.606465220451355, "rewards/format_reward_step": 0.96484375, "step": 151 }, { "adv/mean_abs_final_conf": 0.5161610841751099, "adv/mean_abs_reasoning": 0.3650969862937927, "adv/mean_abs_step_conf": 0.7628662586212158, "adv/ratio_final_to_reasoning": 1.413764297029163, "adv/ratio_step_to_reasoning": 2.089489333684445, "adv/std_final_conf": 0.759178638458252, "adv/std_reasoning": 0.6612246036529541, "adv/std_step_conf": 0.9261345863342285, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.07421875, "calib/ece": 0.31114285714285717, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001273448223275686, "calib/mean_conf": 0.9886938775510205, "calib/mu_c": 0.9887349397590361, "calib/mu_w": 0.9886075949367086, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31114285714285717, "calib/std_conf": 0.003715406872726624, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29652740341419587, "calib/step_q_c_n": 2226.0, "calib/step_q_gap": 0.00012813825804158796, "calib/step_q_w": 0.2963992651561543, "calib/step_q_w_n": 1633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 795.3515625, "completions/mean_terminated_length": 831.0612182617188, "completions/min_length": 0.0, "completions/min_terminated_length": 304.0, "epoch": 0.16213333333333332, "grad_norm": 0.7657935619354248, "kl": 0.134735107421875, "learning_rate": 1.3611111111111112e-06, "loss": -0.1353, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019217420369386673, "mask/share_reasoning": 0.7501330375671387, "mask/share_step_conf": 0.18768081068992615, "num_tokens": 48530452.0, "reward": 0.9302579760551453, "reward_std": 0.14772218465805054, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.8840888738632202, "rewards/final_brier_reward_step": 0.6553331613540649, "rewards/format_reward_step": 0.95703125, "step": 152 }, { "adv/mean_abs_final_conf": 0.4120514392852783, "adv/mean_abs_reasoning": 0.3320033550262451, "adv/mean_abs_step_conf": 0.7646507024765015, "adv/ratio_final_to_reasoning": 1.2411062510278106, "adv/ratio_step_to_reasoning": 2.3031414920974376, "adv/std_final_conf": 0.663988471031189, "adv/std_reasoning": 0.6186351180076599, "adv/std_step_conf": 0.9280796647071838, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 16.66796875, "calib/ece": 0.2630801687763714, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00027996422182452463, "calib/mean_conf": 0.988818565400844, "calib/mu_c": 0.9888953488372092, "calib/mu_w": 0.9886153846153847, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2630801687763714, "calib/std_conf": 0.0033559559198562345, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2575592515592516, "calib/step_q_c_n": 2405.0, "calib/step_q_gap": -0.04960508786072698, "calib/step_q_w": 0.30716433941997856, "calib/step_q_w_n": 1862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 818.0703125, "completions/mean_terminated_length": 883.6539916992188, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.1632, "grad_norm": 0.3014471232891083, "kl": 0.117950439453125, "learning_rate": 1.3333333333333334e-06, "loss": -0.2167, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.016769377514719963, "mask/share_reasoning": 0.7328798770904541, "mask/share_step_conf": 0.1761319935321808, "num_tokens": 48847198.0, "reward": 0.9193028211593628, "reward_std": 0.1415901482105255, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8424032330513, "rewards/final_brier_reward_step": 0.6774523258209229, "rewards/format_reward_step": 0.921875, "step": 153 }, { "adv/mean_abs_final_conf": 0.49666839838027954, "adv/mean_abs_reasoning": 0.3581945300102234, "adv/mean_abs_step_conf": 0.7564102411270142, "adv/ratio_final_to_reasoning": 1.3865884506000243, "adv/ratio_step_to_reasoning": 2.1117302966782465, "adv/std_final_conf": 0.7572185397148132, "adv/std_reasoning": 0.6815637946128845, "adv/std_step_conf": 0.9290932416915894, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 16.56640625, "calib/ece": 0.39849593495934965, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.0005141686582449401, "calib/mean_conf": 0.9879268292682928, "calib/mu_c": 0.9881379310344826, "calib/mu_w": 0.9876237623762376, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39849593495934965, "calib/std_conf": 0.007715121601671246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2583745247148289, "calib/step_q_c_n": 2104.0, "calib/step_q_gap": -0.011095760732059257, "calib/step_q_w": 0.26947028544688817, "calib/step_q_w_n": 2137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 861.01171875, "completions/mean_terminated_length": 888.7862548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.16426666666666667, "grad_norm": 3.4770874977111816, "kl": 0.1408843994140625, "learning_rate": 1.3055555555555556e-06, "loss": -0.083, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017684850841760635, "mask/share_reasoning": 0.7619322538375854, "mask/share_step_conf": 0.1891328990459442, "num_tokens": 49172057.0, "reward": 0.8812761306762695, "reward_std": 0.15116822719573975, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.8822709918022156, "rewards/final_brier_reward_step": 0.5755937099456787, "rewards/format_reward_step": 0.95703125, "step": 154 }, { "adv/mean_abs_final_conf": 0.5971670150756836, "adv/mean_abs_reasoning": 0.5222394466400146, "adv/mean_abs_step_conf": 0.7686342000961304, "adv/ratio_final_to_reasoning": 1.1434735903573314, "adv/ratio_step_to_reasoning": 1.4718041791775225, "adv/std_final_conf": 0.798547625541687, "adv/std_reasoning": 0.7576011419296265, "adv/std_step_conf": 0.928086519241333, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.3984375, "calib/ece": 0.37877470355731224, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.002813852813853046, "calib/mean_conf": 0.9874703557312253, "calib/mu_c": 0.9885714285714287, "calib/mu_w": 0.9857575757575756, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37877470355731224, "calib/std_conf": 0.008752419804981097, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2645356176735798, "calib/step_q_c_n": 2218.0, "calib/step_q_gap": 0.03115974760397422, "calib/step_q_w": 0.23337587006960558, "calib/step_q_w_n": 1724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 800.42578125, "completions/mean_terminated_length": 809.9170532226562, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.16533333333333333, "grad_norm": 0.8505538105964661, "kl": 0.134918212890625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0351, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019242044538259506, "mask/share_reasoning": 0.7679029703140259, "mask/share_step_conf": 0.20113623142242432, "num_tokens": 49484182.0, "reward": 0.9149356484413147, "reward_std": 0.17841386795043945, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.8995552062988281, "rewards/final_brier_reward_step": 0.6123472452163696, "rewards/format_reward_step": 0.98828125, "step": 155 }, { "adv/mean_abs_final_conf": 0.508908748626709, "adv/mean_abs_reasoning": 0.39080560207366943, "adv/mean_abs_step_conf": 0.7372856736183167, "adv/ratio_final_to_reasoning": 1.3022043336287086, "adv/ratio_step_to_reasoning": 1.8865790810218055, "adv/std_final_conf": 0.7618434429168701, "adv/std_reasoning": 0.6815579533576965, "adv/std_step_conf": 0.9298255443572998, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.9921875, "calib/ece": 0.3233603238866396, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.001056420805171876, "calib/mean_conf": 0.987327935222672, "calib/mu_c": 0.9876829268292683, "calib/mu_w": 0.9866265060240964, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3233603238866396, "calib/std_conf": 0.009272404150631377, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25019014693172, "calib/step_q_c_n": 2314.0, "calib/step_q_gap": -0.030169403630077807, "calib/step_q_w": 0.2803595505617978, "calib/step_q_w_n": 1780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 855.875, "completions/mean_terminated_length": 876.416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.1664, "grad_norm": 0.6830697059631348, "kl": 0.12420654296875, "learning_rate": 1.25e-06, "loss": -0.0564, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018573619425296783, "mask/share_reasoning": 0.766772449016571, "mask/share_step_conf": 0.191216379404068, "num_tokens": 49808046.0, "reward": 0.9106454849243164, "reward_std": 0.1469493806362152, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.8511382937431335, "rewards/final_brier_reward_step": 0.6490589380264282, "rewards/format_reward_step": 0.96484375, "step": 156 }, { "adv/mean_abs_final_conf": 0.4624965488910675, "adv/mean_abs_reasoning": 0.34471628069877625, "adv/mean_abs_step_conf": 0.767410159111023, "adv/ratio_final_to_reasoning": 1.341673065030576, "adv/ratio_step_to_reasoning": 2.2262080501547583, "adv/std_final_conf": 0.6862419843673706, "adv/std_reasoning": 0.6186152696609497, "adv/std_step_conf": 0.9305785298347473, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.95703125, "calib/ece": 0.24424000000000012, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0025420026881719515, "calib/mean_conf": 0.9873600000000001, "calib/mu_c": 0.9880107526881721, "calib/mu_w": 0.9854687500000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24380000000000013, "calib/std_conf": 0.009603665966702509, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3088867995487025, "calib/step_q_c_n": 2659.0, "calib/step_q_gap": 0.05935688501878794, "calib/step_q_w": 0.24952991452991458, "calib/step_q_w_n": 1170.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 803.671875, "completions/mean_terminated_length": 822.9600219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.16746666666666668, "grad_norm": 0.7847764492034912, "kl": 0.12835693359375, "learning_rate": 1.2222222222222223e-06, "loss": -0.0895, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020666757598519325, "mask/share_reasoning": 0.7637290358543396, "mask/share_step_conf": 0.19216671586036682, "num_tokens": 50117514.0, "reward": 0.9793211221694946, "reward_std": 0.13415268063545227, "rewards/accuracy_reward_step": 0.7265625, "rewards/asymmetric_l2_reward": 0.8844351172447205, "rewards/final_brier_reward_step": 0.7335820198059082, "rewards/format_reward_step": 0.9765625, "step": 157 }, { "adv/mean_abs_final_conf": 0.5322049856185913, "adv/mean_abs_reasoning": 0.33812499046325684, "adv/mean_abs_step_conf": 0.7591421604156494, "adv/ratio_final_to_reasoning": 1.5739889112881902, "adv/ratio_step_to_reasoning": 2.2451524786013812, "adv/std_final_conf": 0.7345767617225647, "adv/std_reasoning": 0.596189022064209, "adv/std_step_conf": 0.9277385473251343, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.640625, "calib/ece": 0.2602811244979919, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": -0.0008327916802078894, "calib/mean_conf": 0.9871887550200803, "calib/mu_c": 0.9869613259668507, "calib/mu_w": 0.9877941176470586, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2602811244979919, "calib/std_conf": 0.009102744304423354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26329842061512887, "calib/step_q_c_n": 2406.0, "calib/step_q_gap": -0.027007093542844307, "calib/step_q_w": 0.2903055141579732, "calib/step_q_w_n": 1342.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2311.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 789.5703125, "completions/mean_terminated_length": 811.7670288085938, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.16853333333333334, "grad_norm": 0.41278529167175293, "kl": 0.139312744140625, "learning_rate": 1.1944444444444446e-06, "loss": -0.1218, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020901184529066086, "mask/share_reasoning": 0.7587811946868896, "mask/share_step_conf": 0.19297386705875397, "num_tokens": 50424884.0, "reward": 0.9677097797393799, "reward_std": 0.13393598794937134, "rewards/accuracy_reward_step": 0.70703125, "rewards/asymmetric_l2_reward": 0.8862066268920898, "rewards/final_brier_reward_step": 0.7132753729820251, "rewards/format_reward_step": 0.97265625, "step": 158 }, { "adv/mean_abs_final_conf": 0.5357747673988342, "adv/mean_abs_reasoning": 0.36947643756866455, "adv/mean_abs_step_conf": 0.7469610571861267, "adv/ratio_final_to_reasoning": 1.45009184056362, "adv/ratio_step_to_reasoning": 2.021674405278711, "adv/std_final_conf": 0.7742729187011719, "adv/std_reasoning": 0.6612566113471985, "adv/std_step_conf": 0.9283041954040527, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.3671875, "calib/ece": 0.32085020242914986, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": -0.0011315594974131704, "calib/mean_conf": 0.9859514170040484, "calib/mu_c": 0.9855757575757575, "calib/mu_w": 0.9867073170731707, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31939271255060736, "calib/std_conf": 0.02376942350885938, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27897385298470645, "calib/step_q_c_n": 2027.0, "calib/step_q_gap": -0.016004947742125797, "calib/step_q_w": 0.29497880072683225, "calib/step_q_w_n": 1651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 740.125, "completions/mean_terminated_length": 767.0931396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.1696, "grad_norm": 0.7112388610839844, "kl": 0.14520263671875, "learning_rate": 1.1666666666666668e-06, "loss": -0.2309, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02084175869822502, "mask/share_reasoning": 0.7566808462142944, "mask/share_step_conf": 0.18732118606567383, "num_tokens": 50719140.0, "reward": 0.925193727016449, "reward_std": 0.14622630178928375, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8762010335922241, "rewards/final_brier_reward_step": 0.6523113250732422, "rewards/format_reward_step": 0.96484375, "step": 159 }, { "adv/mean_abs_final_conf": 0.5323375463485718, "adv/mean_abs_reasoning": 0.42106419801712036, "adv/mean_abs_step_conf": 0.7754355072975159, "adv/ratio_final_to_reasoning": 1.2642669427974664, "adv/ratio_step_to_reasoning": 1.8416087403042205, "adv/std_final_conf": 0.750114381313324, "adv/std_reasoning": 0.6817347407341003, "adv/std_step_conf": 0.9293457269668579, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 14.78125, "calib/ece": 0.26455284552845526, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9878048780487805, "calib/gap": 0.0018704560475877319, "calib/mean_conf": 0.9872357723577236, "calib/mu_c": 0.987752808988764, "calib/mu_w": 0.9858823529411763, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26410569105691056, "calib/std_conf": 0.012184277020705191, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33007648953301133, "calib/step_q_c_n": 2484.0, "calib/step_q_gap": 0.031088027994549794, "calib/step_q_w": 0.29898846153846154, "calib/step_q_w_n": 1300.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 761.1484375, "completions/mean_terminated_length": 792.0894165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.17066666666666666, "grad_norm": 1.3006924390792847, "kl": 0.128509521484375, "learning_rate": 1.138888888888889e-06, "loss": -0.1998, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019468510523438454, "mask/share_reasoning": 0.7541245222091675, "mask/share_step_conf": 0.18734446167945862, "num_tokens": 51018834.0, "reward": 0.9500536918640137, "reward_std": 0.17531615495681763, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.866344153881073, "rewards/final_brier_reward_step": 0.7025132775306702, "rewards/format_reward_step": 0.9609375, "step": 160 }, { "adv/mean_abs_final_conf": 0.4645047187805176, "adv/mean_abs_reasoning": 0.3434193730354309, "adv/mean_abs_step_conf": 0.7499098777770996, "adv/ratio_final_to_reasoning": 1.3525874055235496, "adv/ratio_step_to_reasoning": 2.1836563008917107, "adv/std_final_conf": 0.7322902083396912, "adv/std_reasoning": 0.6611133813858032, "adv/std_step_conf": 0.9290652275085449, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.8671875, "calib/ece": 0.19295275590551186, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": -0.0005597867479054663, "calib/mean_conf": 0.9857086614173228, "calib/mu_c": 0.9855940594059408, "calib/mu_w": 0.9861538461538463, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19169291338582684, "calib/std_conf": 0.017727424475178482, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34820607857672353, "calib/step_q_c_n": 2698.0, "calib/step_q_gap": 0.07929175932789723, "calib/step_q_w": 0.2689143192488263, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 763.61328125, "completions/mean_terminated_length": 766.60791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.17173333333333332, "grad_norm": 1.2806881666183472, "kl": 0.1336669921875, "learning_rate": 1.111111111111111e-06, "loss": 0.0085, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021376177668571472, "mask/share_reasoning": 0.7761213779449463, "mask/share_step_conf": 0.19859617948532104, "num_tokens": 51318239.0, "reward": 1.0278555154800415, "reward_std": 0.13048477470874786, "rewards/accuracy_reward_step": 0.7890625, "rewards/asymmetric_l2_reward": 0.9052879214286804, "rewards/final_brier_reward_step": 0.794173002243042, "rewards/format_reward_step": 0.9921875, "step": 161 }, { "adv/mean_abs_final_conf": 0.47606128454208374, "adv/mean_abs_reasoning": 0.3419013023376465, "adv/mean_abs_step_conf": 0.7258902192115784, "adv/ratio_final_to_reasoning": 1.3923938905384654, "adv/ratio_step_to_reasoning": 2.123098725417318, "adv/std_final_conf": 0.7196078896522522, "adv/std_reasoning": 0.6401966214179993, "adv/std_step_conf": 0.9285182952880859, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.98046875, "calib/ece": 0.19383399209486168, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0018790662074242848, "calib/mean_conf": 0.9868774703557313, "calib/mu_c": 0.9872636815920398, "calib/mu_w": 0.9853846153846155, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1931225296442688, "calib/std_conf": 0.014341019084552832, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38867825722664434, "calib/step_q_c_n": 2387.0, "calib/step_q_gap": 0.04988552218390929, "calib/step_q_w": 0.33879273504273505, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 761.84765625, "completions/mean_terminated_length": 770.8814697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.1728, "grad_norm": 0.5485326647758484, "kl": 0.1383514404296875, "learning_rate": 1.0833333333333335e-06, "loss": -0.058, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021963439881801605, "mask/share_reasoning": 0.7701320648193359, "mask/share_step_conf": 0.19618576765060425, "num_tokens": 51617416.0, "reward": 1.0232549905776978, "reward_std": 0.13246524333953857, "rewards/accuracy_reward_step": 0.78515625, "rewards/asymmetric_l2_reward": 0.9011022448539734, "rewards/final_brier_reward_step": 0.7907202839851379, "rewards/format_reward_step": 0.98828125, "step": 162 }, { "adv/mean_abs_final_conf": 0.5903889536857605, "adv/mean_abs_reasoning": 0.4718813896179199, "adv/mean_abs_step_conf": 0.7356910109519958, "adv/ratio_final_to_reasoning": 1.2511384569834287, "adv/ratio_step_to_reasoning": 1.55905917702684, "adv/std_final_conf": 0.8337865471839905, "adv/std_reasoning": 0.7575337290763855, "adv/std_step_conf": 0.9304379820823669, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.47265625, "calib/ece": 0.3831048387096776, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": -0.00120680272108864, "calib/mean_conf": 0.987943548387097, "calib/mu_c": 0.9874666666666665, "calib/mu_w": 0.9886734693877551, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3831048387096776, "calib/std_conf": 0.00838855058739805, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42163127413127416, "calib/step_q_c_n": 2072.0, "calib/step_q_gap": 0.0647757950418088, "calib/step_q_w": 0.35685547908946536, "calib/step_q_w_n": 1889.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 833.6875, "completions/mean_terminated_length": 857.1244506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.17386666666666667, "grad_norm": 0.5227140784263611, "kl": 0.13299560546875, "learning_rate": 1.0555555555555557e-06, "loss": -0.111, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020353922620415688, "mask/share_reasoning": 0.7606555819511414, "mask/share_step_conf": 0.19164671003818512, "num_tokens": 51935672.0, "reward": 0.8928499817848206, "reward_std": 0.18691426515579224, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8803619742393494, "rewards/final_brier_reward_step": 0.5944004058837891, "rewards/format_reward_step": 0.96875, "step": 163 }, { "adv/mean_abs_final_conf": 0.5669471025466919, "adv/mean_abs_reasoning": 0.4614078402519226, "adv/mean_abs_step_conf": 0.7551431655883789, "adv/ratio_final_to_reasoning": 1.2287331360410916, "adv/ratio_step_to_reasoning": 1.6366067060674145, "adv/std_final_conf": 0.8116291761398315, "adv/std_reasoning": 0.7393824458122253, "adv/std_step_conf": 0.93180251121521, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 16.48828125, "calib/ece": 0.30372384937238484, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9790794979079498, "calib/gap": -8.395221181756174e-05, "calib/mean_conf": 0.9857322175732216, "calib/mu_c": 0.9857055214723928, "calib/mu_w": 0.9857894736842103, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30372384937238484, "calib/std_conf": 0.016060406619159598, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4359049360146252, "calib/step_q_c_n": 2188.0, "calib/step_q_gap": 0.016214822881324642, "calib/step_q_w": 0.41969011313330057, "calib/step_q_w_n": 2033.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 833.484375, "completions/mean_terminated_length": 881.7024536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.17493333333333333, "grad_norm": 0.6440178751945496, "kl": 0.1234283447265625, "learning_rate": 1.0277777777777777e-06, "loss": -0.1814, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017240755259990692, "mask/share_reasoning": 0.7502830028533936, "mask/share_step_conf": 0.17778876423835754, "num_tokens": 52255180.0, "reward": 0.8986508846282959, "reward_std": 0.19734317064285278, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.838513970375061, "rewards/final_brier_reward_step": 0.6447253227233887, "rewards/format_reward_step": 0.93359375, "step": 164 }, { "adv/mean_abs_final_conf": 0.48889780044555664, "adv/mean_abs_reasoning": 0.4206387400627136, "adv/mean_abs_step_conf": 0.7257621884346008, "adv/ratio_final_to_reasoning": 1.1622747832799856, "adv/ratio_step_to_reasoning": 1.7253812340879395, "adv/std_final_conf": 0.7378907203674316, "adv/std_reasoning": 0.720567524433136, "adv/std_step_conf": 0.9300297498703003, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.4921875, "calib/ece": 0.3847410358565739, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.0007225165562912972, "calib/mean_conf": 0.9863346613545817, "calib/mu_c": 0.9866225165562913, "calib/mu_w": 0.9859, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3847410358565739, "calib/std_conf": 0.014644239100395154, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42854312938816447, "calib/step_q_c_n": 1994.0, "calib/step_q_gap": 0.09999417833921342, "calib/step_q_w": 0.32854895104895104, "calib/step_q_w_n": 1716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 839.66796875, "completions/mean_terminated_length": 856.3944702148438, "completions/min_length": 0.0, "completions/min_terminated_length": 249.0, "epoch": 0.176, "grad_norm": 0.7458361387252808, "kl": 0.126434326171875, "learning_rate": 1.0000000000000002e-06, "loss": -0.1276, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01900230348110199, "mask/share_reasoning": 0.7809528112411499, "mask/share_step_conf": 0.18051370978355408, "num_tokens": 52575711.0, "reward": 0.9076608419418335, "reward_std": 0.16692927479743958, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.9016520977020264, "rewards/final_brier_reward_step": 0.6003882884979248, "rewards/format_reward_step": 0.9765625, "step": 165 }, { "adv/mean_abs_final_conf": 0.4326574206352234, "adv/mean_abs_reasoning": 0.368915855884552, "adv/mean_abs_step_conf": 0.7197380065917969, "adv/ratio_final_to_reasoning": 1.1727807675759498, "adv/ratio_step_to_reasoning": 1.9509543846145518, "adv/std_final_conf": 0.7187753319740295, "adv/std_reasoning": 0.6816666722297668, "adv/std_step_conf": 0.9274219870567322, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.76171875, "calib/ece": 0.24892000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0021330561330561437, "calib/mean_conf": 0.9880400000000001, "calib/mu_c": 0.9885945945945946, "calib/mu_w": 0.9864615384615385, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24848000000000015, "calib/std_conf": 0.010419136240591157, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4181393475634314, "calib/step_q_c_n": 2483.0, "calib/step_q_gap": 0.0673908907733079, "calib/step_q_w": 0.3507484567901235, "calib/step_q_w_n": 1296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 830.58984375, "completions/mean_terminated_length": 847.135498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.17706666666666668, "grad_norm": 0.40786710381507874, "kl": 0.133544921875, "learning_rate": 9.722222222222224e-07, "loss": -0.0629, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019782882183790207, "mask/share_reasoning": 0.7662943601608276, "mask/share_step_conf": 0.19439147412776947, "num_tokens": 52894526.0, "reward": 0.9789649844169617, "reward_std": 0.1657336801290512, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.8896597027778625, "rewards/final_brier_reward_step": 0.7292078137397766, "rewards/format_reward_step": 0.97265625, "step": 166 }, { "adv/mean_abs_final_conf": 0.3472585082054138, "adv/mean_abs_reasoning": 0.2632824182510376, "adv/mean_abs_step_conf": 0.7502952814102173, "adv/ratio_final_to_reasoning": 1.3189582141953198, "adv/ratio_step_to_reasoning": 2.849773586836387, "adv/std_final_conf": 0.6401779055595398, "adv/std_reasoning": 0.5725346207618713, "adv/std_step_conf": 0.9276172518730164, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 14.046875, "calib/ece": 0.1541338582677165, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.0014937106918236909, "calib/mean_conf": 0.9879133858267717, "calib/mu_c": 0.9881603773584906, "calib/mu_w": 0.9866666666666669, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15370078740157478, "calib/std_conf": 0.012230414922074322, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4227017195767196, "calib/step_q_c_n": 3024.0, "calib/step_q_gap": 0.023156265031265055, "calib/step_q_w": 0.39954545454545454, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 816.0078125, "completions/mean_terminated_length": 822.4330444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.17813333333333334, "grad_norm": 0.6228464245796204, "kl": 0.1303558349609375, "learning_rate": 9.444444444444445e-07, "loss": 0.0063, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019823431968688965, "mask/share_reasoning": 0.7804555892944336, "mask/share_step_conf": 0.19190850853919983, "num_tokens": 53209032.0, "reward": 1.0440398454666138, "reward_std": 0.11119484156370163, "rewards/accuracy_reward_step": 0.83203125, "rewards/asymmetric_l2_reward": 0.8957162499427795, "rewards/final_brier_reward_step": 0.8283007740974426, "rewards/format_reward_step": 0.98828125, "step": 167 }, { "adv/mean_abs_final_conf": 0.37970471382141113, "adv/mean_abs_reasoning": 0.3342408239841461, "adv/mean_abs_step_conf": 0.752666711807251, "adv/ratio_final_to_reasoning": 1.1360213551873648, "adv/ratio_step_to_reasoning": 2.2518694839112525, "adv/std_final_conf": 0.6690989136695862, "adv/std_reasoning": 0.640317440032959, "adv/std_step_conf": 0.92930006980896, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 15.7578125, "calib/ece": 0.2583673469387756, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.0006771626883359527, "calib/mean_conf": 0.9889795918367348, "calib/mu_c": 0.9891620111731844, "calib/mu_w": 0.9884848484848484, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2583673469387756, "calib/std_conf": 0.008242472870266313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41947681331747916, "calib/step_q_c_n": 2523.0, "calib/step_q_gap": 0.023143259379689696, "calib/step_q_w": 0.39633355393778946, "calib/step_q_w_n": 1511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 866.0546875, "completions/mean_terminated_length": 901.2601318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.1792, "grad_norm": 0.5811911821365356, "kl": 0.119293212890625, "learning_rate": 9.166666666666666e-07, "loss": -0.1434, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018090544268488884, "mask/share_reasoning": 0.7572444677352905, "mask/share_step_conf": 0.18560248613357544, "num_tokens": 53535414.0, "reward": 0.9531458616256714, "reward_std": 0.14919346570968628, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.87006676197052, "rewards/final_brier_reward_step": 0.7049750089645386, "rewards/format_reward_step": 0.95703125, "step": 168 }, { "adv/mean_abs_final_conf": 0.37095749378204346, "adv/mean_abs_reasoning": 0.33360251784324646, "adv/mean_abs_step_conf": 0.7346676588058472, "adv/ratio_final_to_reasoning": 1.1119745024117276, "adv/ratio_step_to_reasoning": 2.202224562199059, "adv/std_final_conf": 0.6442214846611023, "adv/std_reasoning": 0.6185654401779175, "adv/std_step_conf": 0.9274269938468933, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.88671875, "calib/ece": 0.33363241106719366, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003332640908462192, "calib/mean_conf": 0.9897588932806324, "calib/mu_c": 0.9898734939759034, "calib/mu_w": 0.9895402298850572, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33363241106719366, "calib/std_conf": 0.0019543126427486036, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4195111492281304, "calib/step_q_c_n": 2332.0, "calib/step_q_gap": 0.04443339398810331, "calib/step_q_w": 0.37507775524002707, "calib/step_q_w_n": 1479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 837.70703125, "completions/mean_terminated_length": 851.0040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.18026666666666666, "grad_norm": 0.3905693292617798, "kl": 0.1243896484375, "learning_rate": 8.88888888888889e-07, "loss": -0.0414, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01913364976644516, "mask/share_reasoning": 0.7777827978134155, "mask/share_step_conf": 0.18745854496955872, "num_tokens": 53854051.0, "reward": 0.9391463398933411, "reward_std": 0.13413624465465546, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.8955094814300537, "rewards/final_brier_reward_step": 0.6554394364356995, "rewards/format_reward_step": 0.98828125, "step": 169 }, { "adv/mean_abs_final_conf": 0.4358246326446533, "adv/mean_abs_reasoning": 0.37791427969932556, "adv/mean_abs_step_conf": 0.7431762218475342, "adv/ratio_final_to_reasoning": 1.153236741917776, "adv/ratio_step_to_reasoning": 1.9665206153067745, "adv/std_final_conf": 0.7357816696166992, "adv/std_reasoning": 0.6815745830535889, "adv/std_step_conf": 0.9289593696594238, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.87109375, "calib/ece": 0.2361811023622048, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.0017609906091577976, "calib/mean_conf": 0.9881496062992127, "calib/mu_c": 0.988586387434555, "calib/mu_w": 0.9868253968253972, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2361811023622048, "calib/std_conf": 0.01230042489513696, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4354798172124905, "calib/step_q_c_n": 2626.0, "calib/step_q_gap": 0.08263333072600398, "calib/step_q_w": 0.35284648648648653, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 831.35546875, "completions/mean_terminated_length": 834.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.18133333333333335, "grad_norm": 0.47513461112976074, "kl": 0.128173828125, "learning_rate": 8.611111111111112e-07, "loss": -0.039, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01948484592139721, "mask/share_reasoning": 0.7853529453277588, "mask/share_step_conf": 0.19125592708587646, "num_tokens": 54171030.0, "reward": 0.9985044598579407, "reward_std": 0.15476909279823303, "rewards/accuracy_reward_step": 0.74609375, "rewards/asymmetric_l2_reward": 0.8979229927062988, "rewards/final_brier_reward_step": 0.7522109150886536, "rewards/format_reward_step": 0.98828125, "step": 170 }, { "adv/mean_abs_final_conf": 0.4136347770690918, "adv/mean_abs_reasoning": 0.3237272799015045, "adv/mean_abs_step_conf": 0.738847017288208, "adv/ratio_final_to_reasoning": 1.2777260451913168, "adv/ratio_step_to_reasoning": 2.2823131171182283, "adv/std_final_conf": 0.6824086904525757, "adv/std_reasoning": 0.5960997939109802, "adv/std_step_conf": 0.9261810183525085, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.30859375, "calib/ece": 0.3652208835341366, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9799196787148594, "calib/gap": 0.0016204529855871064, "calib/mean_conf": 0.987710843373494, "calib/mu_c": 0.9883225806451613, "calib/mu_w": 0.9867021276595742, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3652208835341366, "calib/std_conf": 0.013109851092757049, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4757076132648295, "calib/step_q_c_n": 2141.0, "calib/step_q_gap": 0.11602819819171367, "calib/step_q_w": 0.35967941507311585, "calib/step_q_w_n": 1778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 807.4375, "completions/mean_terminated_length": 826.8160400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.1824, "grad_norm": 0.8619520664215088, "kl": 0.1334381103515625, "learning_rate": 8.333333333333333e-07, "loss": -0.1273, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019145745784044266, "mask/share_reasoning": 0.7690473794937134, "mask/share_step_conf": 0.18836939334869385, "num_tokens": 54484630.0, "reward": 0.9073147773742676, "reward_std": 0.1231783926486969, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8840842247009277, "rewards/final_brier_reward_step": 0.6149202585220337, "rewards/format_reward_step": 0.97265625, "step": 171 }, { "adv/mean_abs_final_conf": 0.3303182125091553, "adv/mean_abs_reasoning": 0.3266531229019165, "adv/mean_abs_step_conf": 0.7675945162773132, "adv/ratio_final_to_reasoning": 1.0112201272551105, "adv/ratio_step_to_reasoning": 2.34987655852841, "adv/std_final_conf": 0.6196299195289612, "adv/std_reasoning": 0.6185770034790039, "adv/std_step_conf": 0.9284144043922424, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.19921875, "calib/ece": 0.1666532258064517, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": -0.00093137254901976, "calib/mean_conf": 0.9892338709677421, "calib/mu_c": 0.9890686274509805, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1666532258064517, "calib/std_conf": 0.008070864436821137, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4193341121495327, "calib/step_q_c_n": 2568.0, "calib/step_q_gap": -0.0314531418335976, "calib/step_q_w": 0.4507872539831303, "calib/step_q_w_n": 1067.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 785.828125, "completions/mean_terminated_length": 807.919677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.18346666666666667, "grad_norm": 0.5395084023475647, "kl": 0.138580322265625, "learning_rate": 8.055555555555557e-07, "loss": -0.0653, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020166978240013123, "mask/share_reasoning": 0.7671674489974976, "mask/share_step_conf": 0.18532177805900574, "num_tokens": 54789154.0, "reward": 1.0124435424804688, "reward_std": 0.14000923931598663, "rewards/accuracy_reward_step": 0.796875, "rewards/asymmetric_l2_reward": 0.8716248273849487, "rewards/final_brier_reward_step": 0.8001371026039124, "rewards/format_reward_step": 0.96875, "step": 172 }, { "adv/mean_abs_final_conf": 0.3002045452594757, "adv/mean_abs_reasoning": 0.23893189430236816, "adv/mean_abs_step_conf": 0.7648637890815735, "adv/ratio_final_to_reasoning": 1.2564440010657056, "adv/ratio_step_to_reasoning": 3.2011791113732135, "adv/std_final_conf": 0.5756097435951233, "adv/std_reasoning": 0.5228158235549927, "adv/std_step_conf": 0.92622309923172, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 15.06640625, "calib/ece": 0.23287999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": -0.0012646370023423614, "calib/mean_conf": 0.98888, "calib/mu_c": 0.9885714285714284, "calib/mu_w": 0.9898360655737708, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23287999999999998, "calib/std_conf": 0.008691697187546282, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44958968772694263, "calib/step_q_c_n": 2754.0, "calib/step_q_gap": 0.05079549008415024, "calib/step_q_w": 0.3987941976427924, "calib/step_q_w_n": 1103.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 837.90625, "completions/mean_terminated_length": 858.0160522460938, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.18453333333333333, "grad_norm": 0.843133270740509, "kl": 0.1318817138671875, "learning_rate": 7.777777777777779e-07, "loss": -0.1294, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020681630820035934, "mask/share_reasoning": 0.7679648399353027, "mask/share_step_conf": 0.18791604042053223, "num_tokens": 55106818.0, "reward": 0.9849725961685181, "reward_std": 0.09927868843078613, "rewards/accuracy_reward_step": 0.73828125, "rewards/asymmetric_l2_reward": 0.8840460777282715, "rewards/final_brier_reward_step": 0.7429304718971252, "rewards/format_reward_step": 0.9765625, "step": 173 }, { "adv/mean_abs_final_conf": 0.5924447774887085, "adv/mean_abs_reasoning": 0.5510404706001282, "adv/mean_abs_step_conf": 0.7407152652740479, "adv/ratio_final_to_reasoning": 1.0751384137783704, "adv/ratio_step_to_reasoning": 1.344212094743873, "adv/std_final_conf": 0.8270944952964783, "adv/std_reasoning": 0.8099766373634338, "adv/std_step_conf": 0.9288783073425293, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 16.24609375, "calib/ece": 0.2835000000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.975, "calib/gap": 0.0004867072256026228, "calib/mean_conf": 0.9876666666666668, "calib/mu_c": 0.9878106508875741, "calib/mu_w": 0.9873239436619715, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2835000000000001, "calib/std_conf": 0.01406729856400612, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3551414743112435, "calib/step_q_c_n": 2686.0, "calib/step_q_gap": -0.0026928773520287774, "calib/step_q_w": 0.35783435166327227, "calib/step_q_w_n": 1473.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 839.08203125, "completions/mean_terminated_length": 891.30712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.1856, "grad_norm": 0.38210099935531616, "kl": 0.121185302734375, "learning_rate": 7.5e-07, "loss": -0.2701, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01721511036157608, "mask/share_reasoning": 0.7492451667785645, "mask/share_step_conf": 0.17494595050811768, "num_tokens": 55425855.0, "reward": 0.9197752475738525, "reward_std": 0.2335251122713089, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.8533145189285278, "rewards/final_brier_reward_step": 0.6667046546936035, "rewards/format_reward_step": 0.93359375, "step": 174 }, { "adv/mean_abs_final_conf": 0.5152873992919922, "adv/mean_abs_reasoning": 0.44962120056152344, "adv/mean_abs_step_conf": 0.7445969581604004, "adv/ratio_final_to_reasoning": 1.1460478256996323, "adv/ratio_step_to_reasoning": 1.656053934357382, "adv/std_final_conf": 0.7735191583633423, "adv/std_reasoning": 0.7207322716712952, "adv/std_step_conf": 0.9283451437950134, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 16.30078125, "calib/ece": 0.4540740740740742, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9753086419753086, "calib/gap": -0.00017971409121853021, "calib/mean_conf": 0.9871604938271606, "calib/mu_c": 0.9870769230769231, "calib/mu_w": 0.9872566371681416, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45312757201646103, "calib/std_conf": 0.015280739206682272, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36324510932105863, "calib/step_q_c_n": 1738.0, "calib/step_q_gap": 0.02264962677485738, "calib/step_q_w": 0.34059548254620126, "calib/step_q_w_n": 2435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 883.48046875, "completions/mean_terminated_length": 919.394287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.18666666666666668, "grad_norm": 0.698624849319458, "kl": 0.121673583984375, "learning_rate": 7.222222222222222e-07, "loss": -0.1475, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018637116998434067, "mask/share_reasoning": 0.755055844783783, "mask/share_step_conf": 0.18724456429481506, "num_tokens": 55757850.0, "reward": 0.8373793363571167, "reward_std": 0.187311589717865, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.8646679520606995, "rewards/final_brier_reward_step": 0.5186843872070312, "rewards/format_reward_step": 0.94921875, "step": 175 }, { "adv/mean_abs_final_conf": 0.4194459021091461, "adv/mean_abs_reasoning": 0.35693880915641785, "adv/mean_abs_step_conf": 0.7599928379058838, "adv/ratio_final_to_reasoning": 1.1751199122910068, "adv/ratio_step_to_reasoning": 2.12919642922, "adv/std_final_conf": 0.7031015753746033, "adv/std_reasoning": 0.6612206101417542, "adv/std_step_conf": 0.927830696105957, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.39453125, "calib/ece": 0.31756972111553794, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9681274900398407, "calib/gap": 0.00399741824440647, "calib/mean_conf": 0.9868924302788845, "calib/mu_c": 0.9882142857142858, "calib/mu_w": 0.9842168674698794, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31756972111553794, "calib/std_conf": 0.015864980053618485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37149156118143456, "calib/step_q_c_n": 2370.0, "calib/step_q_gap": 0.08980473750224932, "calib/step_q_w": 0.28168682367918524, "calib/step_q_w_n": 1571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 885.63671875, "completions/mean_terminated_length": 903.2789306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.18773333333333334, "grad_norm": 0.48703455924987793, "kl": 0.12347412109375, "learning_rate": 6.944444444444446e-07, "loss": -0.0846, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019089844077825546, "mask/share_reasoning": 0.7704373002052307, "mask/share_step_conf": 0.19094160199165344, "num_tokens": 56088637.0, "reward": 0.9523155689239502, "reward_std": 0.14043554663658142, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.9112182259559631, "rewards/final_brier_reward_step": 0.6660691499710083, "rewards/format_reward_step": 0.98046875, "step": 176 }, { "adv/mean_abs_final_conf": 0.3711525499820709, "adv/mean_abs_reasoning": 0.2696026861667633, "adv/mean_abs_step_conf": 0.7205239534378052, "adv/ratio_final_to_reasoning": 1.376664881419222, "adv/ratio_step_to_reasoning": 2.6725399649472465, "adv/std_final_conf": 0.6609807014465332, "adv/std_reasoning": 0.5726442933082581, "adv/std_step_conf": 0.9247685074806213, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.83984375, "calib/ece": 0.3177380952380955, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9761904761904762, "calib/gap": -0.001751621872103648, "calib/mean_conf": 0.9875, "calib/mu_c": 0.9869230769230769, "calib/mu_w": 0.9886746987951806, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31730158730158753, "calib/std_conf": 0.014215601757693312, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3016305849838784, "calib/step_q_c_n": 2171.0, "calib/step_q_gap": 0.021427882281175703, "calib/step_q_w": 0.2802027027027027, "calib/step_q_w_n": 1628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 832.15234375, "completions/mean_terminated_length": 845.3611450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.1888, "grad_norm": 0.6601170897483826, "kl": 0.131927490234375, "learning_rate": 6.666666666666667e-07, "loss": -0.0118, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01938062533736229, "mask/share_reasoning": 0.7758159041404724, "mask/share_step_conf": 0.189178466796875, "num_tokens": 56405500.0, "reward": 0.9486652612686157, "reward_std": 0.11378754675388336, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.9012770652770996, "rewards/final_brier_reward_step": 0.6671472787857056, "rewards/format_reward_step": 0.984375, "step": 177 }, { "adv/mean_abs_final_conf": 0.34973353147506714, "adv/mean_abs_reasoning": 0.27946963906288147, "adv/mean_abs_step_conf": 0.764250636100769, "adv/ratio_final_to_reasoning": 1.251418696670575, "adv/ratio_step_to_reasoning": 2.7346463775580663, "adv/std_final_conf": 0.6462281346321106, "adv/std_reasoning": 0.5726738572120667, "adv/std_step_conf": 0.9253974556922913, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.21484375, "calib/ece": 0.2770355731225298, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.00135464231354665, "calib/mean_conf": 0.9884980237154152, "calib/mu_c": 0.9888888888888889, "calib/mu_w": 0.9875342465753423, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2770355731225298, "calib/std_conf": 0.011286644143200747, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32114075792730085, "calib/step_q_c_n": 2586.0, "calib/step_q_gap": 0.09129354631538336, "calib/step_q_w": 0.22984721161191748, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 820.8671875, "completions/mean_terminated_length": 833.8968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.18986666666666666, "grad_norm": 0.5690588355064392, "kl": 0.1266937255859375, "learning_rate": 6.388888888888889e-07, "loss": -0.0393, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01859363354742527, "mask/share_reasoning": 0.7699024677276611, "mask/share_step_conf": 0.19587887823581696, "num_tokens": 56721714.0, "reward": 0.9786163568496704, "reward_std": 0.1166824921965599, "rewards/accuracy_reward_step": 0.703125, "rewards/asymmetric_l2_reward": 0.9098326563835144, "rewards/final_brier_reward_step": 0.7099000215530396, "rewards/format_reward_step": 0.984375, "step": 178 }, { "adv/mean_abs_final_conf": 0.49030888080596924, "adv/mean_abs_reasoning": 0.38043731451034546, "adv/mean_abs_step_conf": 0.7395304441452026, "adv/ratio_final_to_reasoning": 1.2888033379087371, "adv/ratio_step_to_reasoning": 1.9438956588604879, "adv/std_final_conf": 0.7575724720954895, "adv/std_reasoning": 0.6816080212593079, "adv/std_step_conf": 0.9267113208770752, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.6953125, "calib/ece": 0.23162601626016258, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9552845528455285, "calib/gap": 0.006102150537634299, "calib/mean_conf": 0.9854471544715447, "calib/mu_c": 0.9869354838709677, "calib/mu_w": 0.9808333333333334, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23048780487804876, "calib/std_conf": 0.02057119434107837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31232476635514017, "calib/step_q_c_n": 2568.0, "calib/step_q_gap": 0.045599473222811826, "calib/step_q_w": 0.26672529313232834, "calib/step_q_w_n": 1194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 821.94921875, "completions/mean_terminated_length": 845.0562133789062, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.19093333333333334, "grad_norm": 0.6852741837501526, "kl": 0.1256866455078125, "learning_rate": 6.111111111111112e-07, "loss": -0.1584, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01824834942817688, "mask/share_reasoning": 0.7696465253829956, "mask/share_step_conf": 0.1847614198923111, "num_tokens": 57038397.0, "reward": 0.9789311289787292, "reward_std": 0.1612911820411682, "rewards/accuracy_reward_step": 0.7265625, "rewards/asymmetric_l2_reward": 0.8854255080223083, "rewards/final_brier_reward_step": 0.7349367141723633, "rewards/format_reward_step": 0.9609375, "step": 179 }, { "adv/mean_abs_final_conf": 0.399747759103775, "adv/mean_abs_reasoning": 0.28760388493537903, "adv/mean_abs_step_conf": 0.7380611896514893, "adv/ratio_final_to_reasoning": 1.3899247542973674, "adv/ratio_step_to_reasoning": 2.5662420722074817, "adv/std_final_conf": 0.6938978433609009, "adv/std_reasoning": 0.5961751341819763, "adv/std_step_conf": 0.9286543130874634, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 16.7734375, "calib/ece": 0.2512295081967214, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9467213114754098, "calib/gap": 0.0028173614095401422, "calib/mean_conf": 0.9848360655737706, "calib/mu_c": 0.9855865921787711, "calib/mu_w": 0.982769230769231, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2512295081967214, "calib/std_conf": 0.020354787834781736, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30738998211091234, "calib/step_q_c_n": 2795.0, "calib/step_q_gap": -0.034124360784351215, "calib/step_q_w": 0.34151434289526356, "calib/step_q_w_n": 1499.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 941.8203125, "completions/mean_terminated_length": 980.1056518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.192, "grad_norm": 0.8363396525382996, "kl": 0.10614013671875, "learning_rate": 5.833333333333334e-07, "loss": -0.1311, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015935346484184265, "mask/share_reasoning": 0.7640693187713623, "mask/share_step_conf": 0.180932879447937, "num_tokens": 57383359.0, "reward": 0.9518052339553833, "reward_std": 0.13234972953796387, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8657869100570679, "rewards/final_brier_reward_step": 0.7073546648025513, "rewards/format_reward_step": 0.953125, "step": 180 }, { "adv/mean_abs_final_conf": 0.4768427610397339, "adv/mean_abs_reasoning": 0.3964908719062805, "adv/mean_abs_step_conf": 0.7462477684020996, "adv/ratio_final_to_reasoning": 1.2026576015410673, "adv/ratio_step_to_reasoning": 1.8821310180843003, "adv/std_final_conf": 0.7516286373138428, "adv/std_reasoning": 0.6815335154533386, "adv/std_step_conf": 0.9278928637504578, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.6796875, "calib/ece": 0.32225296442687756, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9644268774703557, "calib/gap": 0.0013144257703082696, "calib/mean_conf": 0.9862845849802373, "calib/mu_c": 0.9867261904761904, "calib/mu_w": 0.9854117647058821, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32225296442687756, "calib/std_conf": 0.017457742303337357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3037671232876713, "calib/step_q_c_n": 2336.0, "calib/step_q_gap": 0.030483016395969453, "calib/step_q_w": 0.27328410689170185, "calib/step_q_w_n": 1422.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 843.3203125, "completions/mean_terminated_length": 853.3201904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.19306666666666666, "grad_norm": 0.4284789264202118, "kl": 0.12884521484375, "learning_rate": 5.555555555555555e-07, "loss": -0.0719, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018367484211921692, "mask/share_reasoning": 0.7802478075027466, "mask/share_step_conf": 0.1896660029888153, "num_tokens": 57705513.0, "reward": 0.9494215250015259, "reward_std": 0.14175282418727875, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.9044862985610962, "rewards/final_brier_reward_step": 0.6654503345489502, "rewards/format_reward_step": 0.98828125, "step": 181 }, { "adv/mean_abs_final_conf": 0.4217997193336487, "adv/mean_abs_reasoning": 0.2968176007270813, "adv/mean_abs_step_conf": 0.7443047761917114, "adv/ratio_final_to_reasoning": 1.421073811999061, "adv/ratio_step_to_reasoning": 2.5076167126493516, "adv/std_final_conf": 0.7138107419013977, "adv/std_reasoning": 0.595967173576355, "adv/std_step_conf": 0.9284963607788086, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.921875, "calib/ece": 0.23496062992125982, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9488188976377953, "calib/gap": -0.0021657109615228487, "calib/mean_conf": 0.9851968503937008, "calib/mu_c": 0.9846596858638743, "calib/mu_w": 0.9868253968253972, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23409448818897635, "calib/std_conf": 0.02019005733713578, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25033546928006034, "calib/step_q_c_n": 2653.0, "calib/step_q_gap": 0.02101241872307663, "calib/step_q_w": 0.2293230505569837, "calib/step_q_w_n": 1167.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 867.2265625, "completions/mean_terminated_length": 874.0551147460938, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.19413333333333332, "grad_norm": 0.6338346004486084, "kl": 0.1259765625, "learning_rate": 5.277777777777779e-07, "loss": -0.0303, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018378376960754395, "mask/share_reasoning": 0.7847402095794678, "mask/share_step_conf": 0.18906891345977783, "num_tokens": 58033683.0, "reward": 1.003204345703125, "reward_std": 0.10716600716114044, "rewards/accuracy_reward_step": 0.74609375, "rewards/asymmetric_l2_reward": 0.9067959785461426, "rewards/final_brier_reward_step": 0.7519562244415283, "rewards/format_reward_step": 0.9921875, "step": 182 }, { "adv/mean_abs_final_conf": 0.5021800994873047, "adv/mean_abs_reasoning": 0.4527639150619507, "adv/mean_abs_step_conf": 0.742629885673523, "adv/ratio_final_to_reasoning": 1.1091433808690176, "adv/ratio_step_to_reasoning": 1.6402143831888867, "adv/std_final_conf": 0.7453385591506958, "adv/std_reasoning": 0.7207216024398804, "adv/std_step_conf": 0.9296694993972778, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 17.66796875, "calib/ece": 0.25096385542168675, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9598393574297188, "calib/gap": 0.005558867362145836, "calib/mean_conf": 0.9859036144578314, "calib/mu_c": 0.9873770491803279, "calib/mu_w": 0.981818181818182, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25096385542168675, "calib/std_conf": 0.018105190933622412, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28109772423025436, "calib/step_q_c_n": 2988.0, "calib/step_q_gap": -0.05104559824531574, "calib/step_q_w": 0.3321433224755701, "calib/step_q_w_n": 1535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 941.84375, "completions/mean_terminated_length": 960.6055908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.1952, "grad_norm": 0.6020494103431702, "kl": 0.1154937744140625, "learning_rate": 5.000000000000001e-07, "loss": -0.0921, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017140112817287445, "mask/share_reasoning": 0.7689451575279236, "mask/share_step_conf": 0.19438347220420837, "num_tokens": 58381475.0, "reward": 0.9739905595779419, "reward_std": 0.18247857689857483, "rewards/accuracy_reward_step": 0.71484375, "rewards/asymmetric_l2_reward": 0.8867743611335754, "rewards/final_brier_reward_step": 0.7237066030502319, "rewards/format_reward_step": 0.97265625, "step": 183 }, { "adv/mean_abs_final_conf": 0.5241888761520386, "adv/mean_abs_reasoning": 0.3954259753227234, "adv/mean_abs_step_conf": 0.7569580078125, "adv/ratio_final_to_reasoning": 1.3256308610587013, "adv/ratio_step_to_reasoning": 1.914284986449652, "adv/std_final_conf": 0.8009321689605713, "adv/std_reasoning": 0.7012929916381836, "adv/std_step_conf": 0.9290881156921387, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.91796875, "calib/ece": 0.2717857142857144, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": -0.00024999999999986144, "calib/mean_conf": 0.9860714285714287, "calib/mu_c": 0.986, "calib/mu_w": 0.9862499999999998, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2717857142857144, "calib/std_conf": 0.018388522374079997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28987043580683153, "calib/step_q_c_n": 2547.0, "calib/step_q_gap": 0.043777668511234075, "calib/step_q_w": 0.24609276729559745, "calib/step_q_w_n": 1272.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 861.8125, "completions/mean_terminated_length": 872.0316772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.19626666666666667, "grad_norm": 0.8534870743751526, "kl": 0.124298095703125, "learning_rate": 4.7222222222222226e-07, "loss": -0.0709, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01848047785460949, "mask/share_reasoning": 0.7864163517951965, "mask/share_step_conf": 0.18338441848754883, "num_tokens": 58707379.0, "reward": 0.9695956707000732, "reward_std": 0.148184671998024, "rewards/accuracy_reward_step": 0.703125, "rewards/asymmetric_l2_reward": 0.8913558125495911, "rewards/final_brier_reward_step": 0.7103354930877686, "rewards/format_reward_step": 0.984375, "step": 184 }, { "adv/mean_abs_final_conf": 0.39847496151924133, "adv/mean_abs_reasoning": 0.3419169783592224, "adv/mean_abs_step_conf": 0.744712769985199, "adv/ratio_final_to_reasoning": 1.1654143746573427, "adv/ratio_step_to_reasoning": 2.178051448509217, "adv/std_final_conf": 0.6983887553215027, "adv/std_reasoning": 0.6401586532592773, "adv/std_step_conf": 0.9285353422164917, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.59765625, "calib/ece": 0.26328000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.972, "calib/gap": 0.0016478501080954278, "calib/mean_conf": 0.98728, "calib/mu_c": 0.9877348066298343, "calib/mu_w": 0.9860869565217388, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26328000000000007, "calib/std_conf": 0.014879569886256789, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3357124836885602, "calib/step_q_c_n": 2299.0, "calib/step_q_gap": 0.05943988285406787, "calib/step_q_w": 0.27627260083449234, "calib/step_q_w_n": 1438.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 834.06640625, "completions/mean_terminated_length": 850.6812744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.19733333333333333, "grad_norm": 0.6532776355743408, "kl": 0.118072509765625, "learning_rate": 4.444444444444445e-07, "loss": -0.002, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019208751618862152, "mask/share_reasoning": 0.777682900428772, "mask/share_step_conf": 0.18357715010643005, "num_tokens": 59027820.0, "reward": 0.9746893644332886, "reward_std": 0.12995830178260803, "rewards/accuracy_reward_step": 0.70703125, "rewards/asymmetric_l2_reward": 0.8985029458999634, "rewards/final_brier_reward_step": 0.714156985282898, "rewards/format_reward_step": 0.9765625, "step": 185 }, { "adv/mean_abs_final_conf": 0.3600738048553467, "adv/mean_abs_reasoning": 0.31867513060569763, "adv/mean_abs_step_conf": 0.7509428262710571, "adv/ratio_final_to_reasoning": 1.1299087072498053, "adv/ratio_step_to_reasoning": 2.356452556695464, "adv/std_final_conf": 0.6548113226890564, "adv/std_reasoning": 0.6184834241867065, "adv/std_step_conf": 0.9291680455207825, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 15.3828125, "calib/ece": 0.26072000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.944, "calib/gap": 0.005518456241492609, "calib/mean_conf": 0.98472, "calib/mu_c": 0.9862430939226519, "calib/mu_w": 0.9807246376811593, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26072000000000006, "calib/std_conf": 0.020806768129625508, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32381660470879803, "calib/step_q_c_n": 2421.0, "calib/step_q_gap": 0.06067883279053832, "calib/step_q_w": 0.2631377719182597, "calib/step_q_w_n": 1517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 914.3203125, "completions/mean_terminated_length": 928.8333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.1984, "grad_norm": 1.2435097694396973, "kl": 0.117523193359375, "learning_rate": 4.1666666666666667e-07, "loss": -0.0775, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018042773008346558, "mask/share_reasoning": 0.7825154066085815, "mask/share_step_conf": 0.1838168203830719, "num_tokens": 59366926.0, "reward": 0.9796261787414551, "reward_std": 0.12158802151679993, "rewards/accuracy_reward_step": 0.70703125, "rewards/asymmetric_l2_reward": 0.9057626128196716, "rewards/final_brier_reward_step": 0.716771125793457, "rewards/format_reward_step": 0.9765625, "step": 186 }, { "adv/mean_abs_final_conf": 0.5800182819366455, "adv/mean_abs_reasoning": 0.4727820158004761, "adv/mean_abs_step_conf": 0.7499408721923828, "adv/ratio_final_to_reasoning": 1.2268196812744785, "adv/ratio_step_to_reasoning": 1.5862296938741292, "adv/std_final_conf": 0.8018984198570251, "adv/std_reasoning": 0.7206127047538757, "adv/std_step_conf": 0.9290520548820496, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 16.41015625, "calib/ece": 0.2889068825910932, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9473684210526315, "calib/gap": -0.0016325581395346678, "calib/mean_conf": 0.985263157894737, "calib/mu_c": 0.984767441860465, "calib/mu_w": 0.9863999999999997, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2889068825910932, "calib/std_conf": 0.020096719044249235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34531007751937987, "calib/step_q_c_n": 2451.0, "calib/step_q_gap": -0.011569922480620165, "calib/step_q_w": 0.35688000000000003, "calib/step_q_w_n": 1750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 881.44140625, "completions/mean_terminated_length": 913.5587158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.19946666666666665, "grad_norm": 0.9146304726600647, "kl": 0.112152099609375, "learning_rate": 3.8888888888888895e-07, "loss": -0.1054, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017671627923846245, "mask/share_reasoning": 0.7644854187965393, "mask/share_step_conf": 0.1826866865158081, "num_tokens": 59694119.0, "reward": 0.947091281414032, "reward_std": 0.16564065217971802, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8875942826271057, "rewards/final_brier_reward_step": 0.6792445182800293, "rewards/format_reward_step": 0.96484375, "step": 187 }, { "adv/mean_abs_final_conf": 0.3474765717983246, "adv/mean_abs_reasoning": 0.2792311906814575, "adv/mean_abs_step_conf": 0.7490344047546387, "adv/ratio_final_to_reasoning": 1.2444045772620018, "adv/ratio_step_to_reasoning": 2.6824883098719625, "adv/std_final_conf": 0.6373057961463928, "adv/std_reasoning": 0.5726369619369507, "adv/std_step_conf": 0.9287593364715576, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.7109375, "calib/ece": 0.2517254901960785, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9450980392156862, "calib/gap": 0.004090909090909034, "calib/mean_conf": 0.9850588235294118, "calib/mu_c": 0.9861497326203208, "calib/mu_w": 0.9820588235294118, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2517254901960785, "calib/std_conf": 0.020500991620881318, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.324676343664182, "calib/step_q_c_n": 2549.0, "calib/step_q_gap": 0.005407650155554222, "calib/step_q_w": 0.3192686935086278, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 914.375, "completions/mean_terminated_length": 921.5748291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.20053333333333334, "grad_norm": 0.4690471887588501, "kl": 0.1195526123046875, "learning_rate": 3.611111111111111e-07, "loss": -0.0428, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018977079540491104, "mask/share_reasoning": 0.7888655662536621, "mask/share_step_conf": 0.18434487283229828, "num_tokens": 60032271.0, "reward": 0.998010516166687, "reward_std": 0.10548588633537292, "rewards/accuracy_reward_step": 0.73046875, "rewards/asymmetric_l2_reward": 0.9113494157791138, "rewards/final_brier_reward_step": 0.7393589615821838, "rewards/format_reward_step": 0.99609375, "step": 188 }, { "adv/mean_abs_final_conf": 0.45262610912323, "adv/mean_abs_reasoning": 0.33596310019493103, "adv/mean_abs_step_conf": 0.7510579824447632, "adv/ratio_final_to_reasoning": 1.3472494713276824, "adv/ratio_step_to_reasoning": 2.2355371230024597, "adv/std_final_conf": 0.7394607663154602, "adv/std_reasoning": 0.6401766538619995, "adv/std_step_conf": 0.9280656576156616, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.3828125, "calib/ece": 0.28917322834645665, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9566929133858267, "calib/gap": 0.0010030082911441918, "calib/mean_conf": 0.986023622047244, "calib/mu_c": 0.9863276836158192, "calib/mu_w": 0.985324675324675, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28917322834645665, "calib/std_conf": 0.018323842979201792, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3182495590828924, "calib/step_q_c_n": 2268.0, "calib/step_q_gap": 0.036417875914575515, "calib/step_q_w": 0.28183168316831686, "calib/step_q_w_n": 1414.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 824.72265625, "completions/mean_terminated_length": 834.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.2016, "grad_norm": 0.5540195107460022, "kl": 0.1278076171875, "learning_rate": 3.3333333333333335e-07, "loss": -0.007, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019671592861413956, "mask/share_reasoning": 0.7858209609985352, "mask/share_step_conf": 0.18278874456882477, "num_tokens": 60351168.0, "reward": 0.96614009141922, "reward_std": 0.12750448286533356, "rewards/accuracy_reward_step": 0.69140625, "rewards/asymmetric_l2_reward": 0.8958539366722107, "rewards/final_brier_reward_step": 0.6997073888778687, "rewards/format_reward_step": 0.9921875, "step": 189 }, { "adv/mean_abs_final_conf": 0.3846921920776367, "adv/mean_abs_reasoning": 0.2818589210510254, "adv/mean_abs_step_conf": 0.7572779655456543, "adv/ratio_final_to_reasoning": 1.3648395113525438, "adv/ratio_step_to_reasoning": 2.686726972209487, "adv/std_final_conf": 0.6936730146408081, "adv/std_reasoning": 0.595977246761322, "adv/std_step_conf": 0.9287237524986267, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.0859375, "calib/ece": 0.2838492063492064, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9642857142857143, "calib/gap": -0.0021401129943501296, "calib/mean_conf": 0.9862301587301587, "calib/mu_c": 0.985593220338983, "calib/mu_w": 0.9877333333333331, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2838492063492064, "calib/std_conf": 0.017470911288328544, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3007166600079904, "calib/step_q_c_n": 2503.0, "calib/step_q_gap": -0.022756482008197965, "calib/step_q_w": 0.3234731420161884, "calib/step_q_w_n": 1359.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 908.140625, "completions/mean_terminated_length": 918.9091186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.20266666666666666, "grad_norm": 0.7370194792747498, "kl": 0.1108551025390625, "learning_rate": 3.055555555555556e-07, "loss": -0.0538, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017489243298768997, "mask/share_reasoning": 0.7873534560203552, "mask/share_step_conf": 0.1834385246038437, "num_tokens": 60689260.0, "reward": 0.9662101864814758, "reward_std": 0.11586734652519226, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8983762860298157, "rewards/final_brier_reward_step": 0.698106586933136, "rewards/format_reward_step": 0.984375, "step": 190 }, { "adv/mean_abs_final_conf": 0.3627047836780548, "adv/mean_abs_reasoning": 0.25704121589660645, "adv/mean_abs_step_conf": 0.7288926839828491, "adv/ratio_final_to_reasoning": 1.4110763614810746, "adv/ratio_step_to_reasoning": 2.8357035327596747, "adv/std_final_conf": 0.6555309891700745, "adv/std_reasoning": 0.5482824444770813, "adv/std_step_conf": 0.9267686009407043, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.234375, "calib/ece": 0.3539840637450199, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9721115537848606, "calib/gap": 0.0023236259228875955, "calib/mean_conf": 0.9874501992031872, "calib/mu_c": 0.9883018867924528, "calib/mu_w": 0.9859782608695652, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3539840637450199, "calib/std_conf": 0.0148254287160171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3925788336933045, "calib/step_q_c_n": 2315.0, "calib/step_q_gap": 0.08292583684787863, "calib/step_q_w": 0.30965299684542585, "calib/step_q_w_n": 1585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 844.7109375, "completions/mean_terminated_length": 858.1190795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.20373333333333332, "grad_norm": 0.6378164291381836, "kl": 0.1242523193359375, "learning_rate": 2.7777777777777776e-07, "loss": -0.1195, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01911262795329094, "mask/share_reasoning": 0.7696329951286316, "mask/share_step_conf": 0.19562937319278717, "num_tokens": 61009674.0, "reward": 0.9256492853164673, "reward_std": 0.10838496685028076, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.8994027972221375, "rewards/final_brier_reward_step": 0.6308019161224365, "rewards/format_reward_step": 0.98046875, "step": 191 }, { "adv/mean_abs_final_conf": 0.45994052290916443, "adv/mean_abs_reasoning": 0.305791437625885, "adv/mean_abs_step_conf": 0.7275489568710327, "adv/ratio_final_to_reasoning": 1.5040987624770263, "adv/ratio_step_to_reasoning": 2.379232598922993, "adv/std_final_conf": 0.7334050536155701, "adv/std_reasoning": 0.5961174964904785, "adv/std_step_conf": 0.926038384437561, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.58984375, "calib/ece": 0.2882539682539683, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9682539682539683, "calib/gap": 0.002763157894737245, "calib/mean_conf": 0.9866666666666667, "calib/mu_c": 0.9875, "calib/mu_w": 0.9847368421052628, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2882539682539683, "calib/std_conf": 0.016354214058166085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3370448662640207, "calib/step_q_c_n": 2318.0, "calib/step_q_gap": 0.052514167604881645, "calib/step_q_w": 0.28453069865913905, "calib/step_q_w_n": 1417.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2693.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 889.5859375, "completions/mean_terminated_length": 900.1343994140625, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.2048, "grad_norm": 0.5918493866920471, "kl": 0.12744140625, "learning_rate": 2.5000000000000004e-07, "loss": -0.0365, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020594235509634018, "mask/share_reasoning": 0.7800520658493042, "mask/share_step_conf": 0.18763500452041626, "num_tokens": 61342384.0, "reward": 0.9666069149971008, "reward_std": 0.1264154016971588, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.9027146100997925, "rewards/final_brier_reward_step": 0.6961241960525513, "rewards/format_reward_step": 0.984375, "step": 192 }, { "adv/mean_abs_final_conf": 0.6095333099365234, "adv/mean_abs_reasoning": 0.5772269368171692, "adv/mean_abs_step_conf": 0.7257479429244995, "adv/ratio_final_to_reasoning": 1.0559682354699031, "adv/ratio_step_to_reasoning": 1.2573008926545866, "adv/std_final_conf": 0.8253099918365479, "adv/std_reasoning": 0.8099737763404846, "adv/std_step_conf": 0.9309096336364746, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 16.1640625, "calib/ece": 0.3582786885245902, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9508196721311475, "calib/gap": 0.005517489046900859, "calib/mean_conf": 0.985327868852459, "calib/mu_c": 0.9873856209150327, "calib/mu_w": 0.9818681318681318, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3582786885245902, "calib/std_conf": 0.01961449891252278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3347565543071161, "calib/step_q_c_n": 2136.0, "calib/step_q_gap": -0.007326362775800954, "calib/step_q_w": 0.34208291708291705, "calib/step_q_w_n": 2002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 827.8046875, "completions/mean_terminated_length": 864.9713745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.20586666666666667, "grad_norm": 0.7676878571510315, "kl": 0.123687744140625, "learning_rate": 2.2222222222222224e-07, "loss": -0.2038, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01849346235394478, "mask/share_reasoning": 0.7518797516822815, "mask/share_step_conf": 0.18665803968906403, "num_tokens": 61660014.0, "reward": 0.8981163501739502, "reward_std": 0.23210570216178894, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8745388984680176, "rewards/final_brier_reward_step": 0.6099749803543091, "rewards/format_reward_step": 0.953125, "step": 193 }, { "adv/mean_abs_final_conf": 0.4632437825202942, "adv/mean_abs_reasoning": 0.3431586027145386, "adv/mean_abs_step_conf": 0.731201708316803, "adv/ratio_final_to_reasoning": 1.3499407529224912, "adv/ratio_step_to_reasoning": 2.1307981281327915, "adv/std_final_conf": 0.7352818846702576, "adv/std_reasoning": 0.6402397155761719, "adv/std_step_conf": 0.9296265840530396, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.3125, "calib/ece": 0.3079116465863454, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.963855421686747, "calib/gap": 0.003501479289940823, "calib/mean_conf": 0.9866265060240964, "calib/mu_c": 0.9877514792899409, "calib/mu_w": 0.9842500000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3079116465863454, "calib/std_conf": 0.01681057530273725, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3543109131403118, "calib/step_q_c_n": 2245.0, "calib/step_q_gap": 0.046907928065684945, "calib/step_q_w": 0.3074029850746269, "calib/step_q_w_n": 1675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 840.77734375, "completions/mean_terminated_length": 857.5259399414062, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.20693333333333333, "grad_norm": 0.43501731753349304, "kl": 0.1187896728515625, "learning_rate": 1.9444444444444447e-07, "loss": -0.1175, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018635708838701248, "mask/share_reasoning": 0.7715604305267334, "mask/share_step_conf": 0.19027259945869446, "num_tokens": 61981197.0, "reward": 0.938804030418396, "reward_std": 0.14546942710876465, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.8861815333366394, "rewards/final_brier_reward_step": 0.6656453013420105, "rewards/format_reward_step": 0.96875, "step": 194 }, { "adv/mean_abs_final_conf": 0.4656180143356323, "adv/mean_abs_reasoning": 0.34245651960372925, "adv/mean_abs_step_conf": 0.7472131848335266, "adv/ratio_final_to_reasoning": 1.3596412615371387, "adv/ratio_step_to_reasoning": 2.1819213303287617, "adv/std_final_conf": 0.7315597534179688, "adv/std_reasoning": 0.6185890436172485, "adv/std_step_conf": 0.9277791380882263, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 15.4921875, "calib/ece": 0.35260000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.94, "calib/gap": 0.000742982938910508, "calib/mean_conf": 0.9846000000000001, "calib/mu_c": 0.9848734177215189, "calib/mu_w": 0.9841304347826084, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35260000000000014, "calib/std_conf": 0.021373815756668243, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34211645101663585, "calib/step_q_c_n": 2164.0, "calib/step_q_gap": 0.05884231117201871, "calib/step_q_w": 0.28327413984461713, "calib/step_q_w_n": 1802.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 847.4765625, "completions/mean_terminated_length": 867.8160400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.208, "grad_norm": 0.767591655254364, "kl": 0.121429443359375, "learning_rate": 1.6666666666666668e-07, "loss": -0.1218, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01915118098258972, "mask/share_reasoning": 0.7693878412246704, "mask/share_step_conf": 0.18802347779273987, "num_tokens": 62304135.0, "reward": 0.9228836297988892, "reward_std": 0.13523530960083008, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8991012573242188, "rewards/final_brier_reward_step": 0.6279159784317017, "rewards/format_reward_step": 0.9765625, "step": 195 }, { "adv/mean_abs_final_conf": 0.3352336287498474, "adv/mean_abs_reasoning": 0.25784218311309814, "adv/mean_abs_step_conf": 0.7582453489303589, "adv/ratio_final_to_reasoning": 1.3001504435866602, "adv/ratio_step_to_reasoning": 2.9407342886084984, "adv/std_final_conf": 0.6322796940803528, "adv/std_reasoning": 0.5483107566833496, "adv/std_step_conf": 0.9287676215171814, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.9765625, "calib/ece": 0.2836363636363637, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": -0.0022831460674155313, "calib/mean_conf": 0.9871936758893282, "calib/mu_c": 0.9865168539325843, "calib/mu_w": 0.9887999999999998, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2836363636363637, "calib/std_conf": 0.015541288865804125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38224508050089445, "calib/step_q_c_n": 2236.0, "calib/step_q_gap": 0.04490622230568264, "calib/step_q_w": 0.3373388581952118, "calib/step_q_w_n": 1086.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 726.90234375, "completions/mean_terminated_length": 732.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.20906666666666668, "grad_norm": 0.5676056742668152, "kl": 0.1354522705078125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0393, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02158435806632042, "mask/share_reasoning": 0.7757572531700134, "mask/share_step_conf": 0.19484591484069824, "num_tokens": 62592766.0, "reward": 0.9680054187774658, "reward_std": 0.10587546229362488, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.8978179693222046, "rewards/final_brier_reward_step": 0.7014741897583008, "rewards/format_reward_step": 0.98828125, "step": 196 }, { "adv/mean_abs_final_conf": 0.5880240201950073, "adv/mean_abs_reasoning": 0.49583712220191956, "adv/mean_abs_step_conf": 0.7577347755432129, "adv/ratio_final_to_reasoning": 1.1859217349110591, "adv/ratio_step_to_reasoning": 1.5281929117736386, "adv/std_final_conf": 0.8244036436080933, "adv/std_reasoning": 0.7576045393943787, "adv/std_step_conf": 0.929575502872467, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.15625, "calib/ece": 0.3315476190476192, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9603174603174603, "calib/gap": 0.0024388714733541983, "calib/mean_conf": 0.986309523809524, "calib/mu_c": 0.9871515151515151, "calib/mu_w": 0.9847126436781609, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3315476190476192, "calib/std_conf": 0.017578379706519476, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32334044368600684, "calib/step_q_c_n": 2344.0, "calib/step_q_gap": -0.0184499208973265, "calib/step_q_w": 0.34179036458333334, "calib/step_q_w_n": 1536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 854.32421875, "completions/mean_terminated_length": 864.45458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.21013333333333334, "grad_norm": 1.0512775182724, "kl": 0.12310791015625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0641, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018566645681858063, "mask/share_reasoning": 0.7808510065078735, "mask/share_step_conf": 0.188863605260849, "num_tokens": 62916529.0, "reward": 0.9386444091796875, "reward_std": 0.194078266620636, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8970743417739868, "rewards/final_brier_reward_step": 0.6544331908226013, "rewards/format_reward_step": 0.984375, "step": 197 }, { "adv/mean_abs_final_conf": 0.47785258293151855, "adv/mean_abs_reasoning": 0.4552566409111023, "adv/mean_abs_step_conf": 0.7427797317504883, "adv/ratio_final_to_reasoning": 1.0496334155064606, "adv/ratio_step_to_reasoning": 1.6315626506050913, "adv/std_final_conf": 0.7191026210784912, "adv/std_reasoning": 0.7015376091003418, "adv/std_step_conf": 0.9298596978187561, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.83984375, "calib/ece": 0.2682186234817814, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9878542510121457, "calib/gap": -0.0013719263963521966, "calib/mean_conf": 0.9888663967611336, "calib/mu_c": 0.9884831460674157, "calib/mu_w": 0.9898550724637679, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2682186234817814, "calib/std_conf": 0.00987422750386308, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34472174657534244, "calib/step_q_c_n": 2336.0, "calib/step_q_gap": -0.020862669009073076, "calib/step_q_w": 0.3655844155844155, "calib/step_q_w_n": 1463.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 790.015625, "completions/mean_terminated_length": 818.8016357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.2112, "grad_norm": 0.9970131516456604, "kl": 0.1248321533203125, "learning_rate": 8.333333333333334e-08, "loss": -0.1528, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.020461376756429672, "mask/share_reasoning": 0.7594587802886963, "mask/share_step_conf": 0.18492358922958374, "num_tokens": 63224157.0, "reward": 0.9614353179931641, "reward_std": 0.18354573845863342, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.890271782875061, "rewards/final_brier_reward_step": 0.7005675435066223, "rewards/format_reward_step": 0.96484375, "step": 198 }, { "adv/mean_abs_final_conf": 0.3442460894584656, "adv/mean_abs_reasoning": 0.30875712633132935, "adv/mean_abs_step_conf": 0.7558103799819946, "adv/ratio_final_to_reasoning": 1.1149413571398925, "adv/ratio_step_to_reasoning": 2.4479123412067563, "adv/std_final_conf": 0.6221913695335388, "adv/std_reasoning": 0.5962019562721252, "adv/std_step_conf": 0.928808331489563, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.38671875, "calib/ece": 0.2816935483870968, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9758064516129032, "calib/gap": 0.0032172211350293134, "calib/mean_conf": 0.9873387096774194, "calib/mu_c": 0.9882857142857142, "calib/mu_w": 0.9850684931506849, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2816935483870968, "calib/std_conf": 0.014541867861994644, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3489337919174549, "calib/step_q_c_n": 2326.0, "calib/step_q_gap": 0.017179923089157167, "calib/step_q_w": 0.3317538688282977, "calib/step_q_w_n": 1357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 828.625, "completions/mean_terminated_length": 855.3547973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.21226666666666666, "grad_norm": 0.8685548305511475, "kl": 0.128143310546875, "learning_rate": 5.555555555555556e-08, "loss": -0.1399, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0195816308259964, "mask/share_reasoning": 0.7623173594474792, "mask/share_step_conf": 0.18685100972652435, "num_tokens": 63540485.0, "reward": 0.9592928290367126, "reward_std": 0.1366279572248459, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.8963676691055298, "rewards/final_brier_reward_step": 0.6917492151260376, "rewards/format_reward_step": 0.96875, "step": 199 }, { "adv/mean_abs_final_conf": 0.38301950693130493, "adv/mean_abs_reasoning": 0.3488479256629944, "adv/mean_abs_step_conf": 0.7581737041473389, "adv/ratio_final_to_reasoning": 1.0979555237525538, "adv/ratio_step_to_reasoning": 2.1733645189559616, "adv/std_final_conf": 0.6806488633155823, "adv/std_reasoning": 0.6402129530906677, "adv/std_step_conf": 0.9278504252433777, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 15.03515625, "calib/ece": 0.26602459016393454, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9713114754098361, "calib/gap": 0.007723930481283392, "calib/mean_conf": 0.9873360655737706, "calib/mu_c": 0.9894886363636364, "calib/mu_w": 0.981764705882353, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26602459016393454, "calib/std_conf": 0.015036627769690402, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40364406779661016, "calib/step_q_c_n": 2183.0, "calib/step_q_gap": 0.14485655279060772, "calib/step_q_w": 0.25878751500600244, "calib/step_q_w_n": 1666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 815.4765625, "completions/mean_terminated_length": 855.5819091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.21333333333333335, "grad_norm": 0.7031919956207275, "kl": 0.115997314453125, "learning_rate": 2.777777777777778e-08, "loss": -0.1807, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01851600408554077, "mask/share_reasoning": 0.7547211647033691, "mask/share_step_conf": 0.17988784611225128, "num_tokens": 63857295.0, "reward": 0.9507389068603516, "reward_std": 0.13375723361968994, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.8765336275100708, "rewards/final_brier_reward_step": 0.6968191266059875, "rewards/format_reward_step": 0.953125, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.09897489776369184, "train_runtime": 14583.0994, "train_samples_per_second": 3.511, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 63857295, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }