{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.6044455766677856, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.266794080007825, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.8712793588638306, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04669388756155968, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": 0.0112, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.5306904315948486, "reward_std": 0.15138749778270721, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.016632115468382835, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.5409098267555237, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.059662587969888, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.8044619560241699, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.03493440896272659, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0693, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.47535353899002075, "reward_std": 0.15537551045417786, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.012380896136164665, "step": 2 }, { "adv/mean_abs_final_conf": 0.7887527942657471, "adv/mean_abs_reasoning": 0.46889227628707886, "adv/mean_abs_step_conf": 0.543759822845459, "adv/ratio_final_to_reasoning": 1.6821620533216757, "adv/ratio_step_to_reasoning": 1.159668969493843, "adv/std_final_conf": 0.9310469627380371, "adv/std_reasoning": 0.720630407333374, "adv/std_step_conf": 0.8235465288162231, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.449075001669672, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.24200787401574808, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.24803149606299213, "calib/gap": -0.00770920991117352, "calib/mean_conf": 0.8758661417322834, "calib/mu_c": 0.8730434782608696, "calib/mu_w": 0.8807526881720431, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24200787401574808, "calib/std_conf": 0.047036568485355625, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7858236994219653, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.03556097364199806, "calib/step_q_w": 0.7502627257799672, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 496.265625, "completions/mean_terminated_length": 498.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.06859400123357773, "kl": 0.0012768059968948364, "learning_rate": 7.5e-07, "loss": -0.0058, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03323023393750191, "mask/share_reasoning": 0.8530337810516357, "mask/share_step_conf": 0.10982976853847504, "num_tokens": 690961.0, "reward": 0.5174956917762756, "reward_std": 0.1475203037261963, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6964457035064697, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.015889329835772514, "step": 3 }, { "adv/mean_abs_final_conf": 0.7630407810211182, "adv/mean_abs_reasoning": 0.3918391764163971, "adv/mean_abs_step_conf": 0.5928314924240112, "adv/ratio_final_to_reasoning": 1.9473315251414651, "adv/ratio_step_to_reasoning": 1.5129459433990462, "adv/std_final_conf": 0.9291425347328186, "adv/std_reasoning": 0.6815277338027954, "adv/std_step_conf": 0.8545830249786377, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4562167553191489, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.2471259842519684, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2559055118110236, "calib/gap": -0.003505319148936148, "calib/mean_conf": 0.8770472440944881, "calib/mu_c": 0.87575, "calib/mu_w": 0.8792553191489362, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2471259842519684, "calib/std_conf": 0.0441403199663946, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7996050955414012, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.014049539985845572, "calib/step_q_w": 0.7855555555555557, "calib/step_q_w_n": 540.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 504.796875, "completions/mean_terminated_length": 504.796875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.004266666666666667, "grad_norm": 0.05691548436880112, "kl": 0.0002792775630950928, "learning_rate": 1.0000000000000002e-06, "loss": -0.0132, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03401482105255127, "mask/share_reasoning": 0.8495725393295288, "mask/share_step_conf": 0.11641263961791992, "num_tokens": 926357.0, "reward": 0.5209752321243286, "reward_std": 0.1293841153383255, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6967394351959229, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.021773390471935272, "step": 4 }, { "adv/mean_abs_final_conf": 0.7602543234825134, "adv/mean_abs_reasoning": 0.396877646446228, "adv/mean_abs_step_conf": 0.4481867551803589, "adv/ratio_final_to_reasoning": 1.9155886714459198, "adv/ratio_step_to_reasoning": 1.129281931581608, "adv/std_final_conf": 0.9304333329200745, "adv/std_reasoning": 0.6815878748893738, "adv/std_step_conf": 0.7514729499816895, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5121826371826371, "calib/avg_num_step_conf": 4.82421875, "calib/ece": 0.33835390946502053, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.23868312757201646, "calib/gap": 0.00017608517608513896, "calib/mean_conf": 0.8761316872427983, "calib/mu_c": 0.8762121212121212, "calib/mu_w": 0.8760360360360361, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.3356378600823045, "calib/std_conf": 0.040234077749832294, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.7997305389221557, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.006220838745788937, "calib/step_q_w": 0.7935097001763668, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 519.20703125, "completions/mean_terminated_length": 523.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.005333333333333333, "grad_norm": 0.04682963714003563, "kl": 0.00027504563331604004, "learning_rate": 1.25e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.033292610198259354, "mask/share_reasoning": 0.8487807512283325, "mask/share_step_conf": 0.11011415719985962, "num_tokens": 1165962.0, "reward": 0.451824426651001, "reward_std": 0.10982717573642731, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6050621271133423, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.0071805063635110855, "step": 5 }, { "adv/mean_abs_final_conf": 0.7697786092758179, "adv/mean_abs_reasoning": 0.3637967109680176, "adv/mean_abs_step_conf": 0.5329043865203857, "adv/ratio_final_to_reasoning": 2.115958132847142, "adv/ratio_step_to_reasoning": 1.4648411336715876, "adv/std_final_conf": 0.9304026961326599, "adv/std_reasoning": 0.6402936577796936, "adv/std_step_conf": 0.7721468806266785, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5600583460172501, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.30850393700787404, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.30708661417322836, "calib/gap": 0.007686453576864594, "calib/mean_conf": 0.8833070866141732, "calib/mu_c": 0.8865753424657536, "calib/mu_w": 0.878888888888889, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30850393700787404, "calib/std_conf": 0.0387614370850464, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7954739652870493, "calib/step_q_c_n": 749.0, "calib/step_q_gap": -0.008452736283631301, "calib/step_q_w": 0.8039267015706806, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 442.4453125, "completions/mean_terminated_length": 444.180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0064, "grad_norm": 0.03694045543670654, "kl": 0.001511305570602417, "learning_rate": 1.5e-06, "loss": -0.0009, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03734651952981949, "mask/share_reasoning": 0.8304245471954346, "mask/share_step_conf": 0.12832267582416534, "num_tokens": 1385180.0, "reward": 0.4941892623901367, "reward_std": 0.12611064314842224, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6574984788894653, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.018380088731646538, "step": 6 }, { "adv/mean_abs_final_conf": 0.7861167192459106, "adv/mean_abs_reasoning": 0.5132968425750732, "adv/mean_abs_step_conf": 0.6604681015014648, "adv/ratio_final_to_reasoning": 1.5315050747286363, "adv/ratio_step_to_reasoning": 1.2867176392281563, "adv/std_final_conf": 0.9294589757919312, "adv/std_reasoning": 0.7575724720954895, "adv/std_step_conf": 0.888586699962616, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.44725492880613366, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.2301574803149607, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3425196850393701, "calib/gap": -0.004422234392114088, "calib/mean_conf": 0.8837007874015749, "calib/mu_c": 0.8821686746987951, "calib/mu_w": 0.8865909090909092, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2301574803149607, "calib/std_conf": 0.04947445295749102, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7900000000000001, "calib/step_q_c_n": 959.0, "calib/step_q_gap": -0.001106719367588771, "calib/step_q_w": 0.7911067193675889, "calib/step_q_w_n": 506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 551.875, "completions/mean_terminated_length": 554.0392456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.007466666666666667, "grad_norm": 0.03534330055117607, "kl": 0.0002709925174713135, "learning_rate": 1.75e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03001858852803707, "mask/share_reasoning": 0.8544216752052307, "mask/share_step_conf": 0.11165347695350647, "num_tokens": 1633884.0, "reward": 0.5264572501182556, "reward_std": 0.1676599234342575, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7105578184127808, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.01423170231282711, "step": 7 }, { "adv/mean_abs_final_conf": 0.7548643350601196, "adv/mean_abs_reasoning": 0.40485942363739014, "adv/mean_abs_step_conf": 0.6347297430038452, "adv/ratio_final_to_reasoning": 1.8645097310028511, "adv/ratio_step_to_reasoning": 1.5677781124648762, "adv/std_final_conf": 0.9299923181533813, "adv/std_reasoning": 0.6815534234046936, "adv/std_step_conf": 0.8721611499786377, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.533046357615894, "calib/avg_num_step_conf": 5.0234375, "calib/ece": 0.28219123505976107, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.33067729083665337, "calib/gap": 0.007439735099337819, "calib/mean_conf": 0.8810756972111554, "calib/mu_c": 0.8840397350993378, "calib/mu_w": 0.8765999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2808366533864543, "calib/std_conf": 0.047713018333970465, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.800791788856305, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.039632848458953984, "calib/step_q_w": 0.761158940397351, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 529.3359375, "completions/mean_terminated_length": 529.3359375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.008533333333333334, "grad_norm": 0.05506416782736778, "kl": 0.00039881467819213867, "learning_rate": 2.0000000000000003e-06, "loss": -0.0175, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03296989947557449, "mask/share_reasoning": 0.858788251876831, "mask/share_step_conf": 0.10824184119701385, "num_tokens": 1875906.0, "reward": 0.50176602602005, "reward_std": 0.1313149333000183, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6687449216842651, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.021505873650312424, "step": 8 }, { "adv/mean_abs_final_conf": 0.7711054682731628, "adv/mean_abs_reasoning": 0.45214176177978516, "adv/mean_abs_step_conf": 0.6083589196205139, "adv/ratio_final_to_reasoning": 1.705450664937092, "adv/ratio_step_to_reasoning": 1.3455048196074708, "adv/std_final_conf": 0.9269088506698608, "adv/std_reasoning": 0.7014849781990051, "adv/std_step_conf": 0.8728888034820557, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5543522267206478, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.26605577689243015, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3107569721115538, "calib/gap": 0.00675978407557376, "calib/mean_conf": 0.8875697211155379, "calib/mu_c": 0.8901282051282052, "calib/mu_w": 0.8833684210526315, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26605577689243015, "calib/std_conf": 0.04006058807176065, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7975698324022346, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.057863950049293456, "calib/step_q_w": 0.7397058823529411, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 498.11328125, "completions/mean_terminated_length": 502.0354309082031, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.0096, "grad_norm": 0.04451728239655495, "kl": 0.00037294626235961914, "learning_rate": 2.25e-06, "loss": -0.0451, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03347175195813179, "mask/share_reasoning": 0.8503499031066895, "mask/share_step_conf": 0.10836583375930786, "num_tokens": 2110959.0, "reward": 0.507091224193573, "reward_std": 0.14986293017864227, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6730566620826721, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.025500783696770668, "step": 9 }, { "adv/mean_abs_final_conf": 0.7563279867172241, "adv/mean_abs_reasoning": 0.40436792373657227, "adv/mean_abs_step_conf": 0.6248204112052917, "adv/ratio_final_to_reasoning": 1.8703956034107645, "adv/ratio_step_to_reasoning": 1.545177979083065, "adv/std_final_conf": 0.9293547868728638, "adv/std_reasoning": 0.6815210580825806, "adv/std_step_conf": 0.8563450574874878, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.49091796875000004, "calib/avg_num_step_conf": 5.24609375, "calib/ece": 0.2720703125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.45703125, "calib/gap": -0.0008541666666667114, "calib/mean_conf": 0.8970703125, "calib/mu_c": 0.8967499999999999, "calib/mu_w": 0.8976041666666666, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2720703125, "calib/std_conf": 0.042079850358007966, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7894386469072164, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.01291307371497652, "calib/step_q_w": 0.7765255731922399, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 509.63671875, "completions/mean_terminated_length": 511.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.010666666666666666, "grad_norm": 0.05300568789243698, "kl": 0.004910945892333984, "learning_rate": 2.5e-06, "loss": 0.0133, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03219065070152283, "mask/share_reasoning": 0.8539580702781677, "mask/share_step_conf": 0.10994504392147064, "num_tokens": 2348226.0, "reward": 0.530168890953064, "reward_std": 0.14572685956954956, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6894316673278809, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.045906148850917816, "step": 10 }, { "adv/mean_abs_final_conf": 0.7720677852630615, "adv/mean_abs_reasoning": 0.46977752447128296, "adv/mean_abs_step_conf": 0.6252034902572632, "adv/ratio_final_to_reasoning": 1.643475358111257, "adv/ratio_step_to_reasoning": 1.330850152869502, "adv/std_final_conf": 0.9288809895515442, "adv/std_reasoning": 0.7394527196884155, "adv/std_step_conf": 0.8714380860328674, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.47403531720078484, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.3512048192771085, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5060240963855421, "calib/gap": -0.011448005232177816, "calib/mean_conf": 0.8977911646586346, "calib/mu_c": 0.8927338129496403, "calib/mu_w": 0.9041818181818181, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.34538152610441775, "calib/std_conf": 0.07417782662020629, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7953314121037465, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.0014460617852751767, "calib/step_q_w": 0.7938853503184713, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 548.30078125, "completions/mean_terminated_length": 550.4509887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.011733333333333333, "grad_norm": 0.04260000213980675, "kl": 0.0010578632354736328, "learning_rate": 2.7500000000000004e-06, "loss": 0.0507, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031202612444758415, "mask/share_reasoning": 0.8554599285125732, "mask/share_step_conf": 0.109431192278862, "num_tokens": 2593071.0, "reward": 0.4667537212371826, "reward_std": 0.16893517971038818, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6046609282493591, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.02728401869535446, "step": 11 }, { "adv/mean_abs_final_conf": 0.7520790100097656, "adv/mean_abs_reasoning": 0.42200881242752075, "adv/mean_abs_step_conf": 0.556553840637207, "adv/ratio_final_to_reasoning": 1.7821405332357456, "adv/ratio_step_to_reasoning": 1.3188204232886584, "adv/std_final_conf": 0.9263741970062256, "adv/std_reasoning": 0.701383113861084, "adv/std_step_conf": 0.8084006905555725, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4681224899598393, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.2426693227091633, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5737051792828686, "calib/gap": -0.005078169822145706, "calib/mean_conf": 0.9081673306772908, "calib/mu_c": 0.9064880952380954, "calib/mu_w": 0.9115662650602411, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24075697211155372, "calib/std_conf": 0.043737580917423656, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7972631578947369, "calib/step_q_c_n": 855.0, "calib/step_q_gap": 0.03908607456140345, "calib/step_q_w": 0.7581770833333334, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 490.5078125, "completions/mean_terminated_length": 494.3700866699219, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0128, "grad_norm": 0.0388772189617157, "kl": 0.0033174753189086914, "learning_rate": 3e-06, "loss": 0.0003, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036001648753881454, "mask/share_reasoning": 0.8289986848831177, "mask/share_step_conf": 0.12718716263771057, "num_tokens": 2822817.0, "reward": 0.5267300605773926, "reward_std": 0.14635935425758362, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6989824771881104, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.028696559369564056, "step": 12 }, { "adv/mean_abs_final_conf": 0.7230167984962463, "adv/mean_abs_reasoning": 0.4463549852371216, "adv/mean_abs_step_conf": 0.630902886390686, "adv/ratio_final_to_reasoning": 1.6198246292962337, "adv/ratio_step_to_reasoning": 1.4134554497145926, "adv/std_final_conf": 0.9265499114990234, "adv/std_reasoning": 0.7391461133956909, "adv/std_step_conf": 0.8726893663406372, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5741363396624473, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.2860236220472442, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5748031496062992, "calib/gap": 0.013309599156118068, "calib/mean_conf": 0.9080708661417323, "calib/mu_c": 0.9131012658227848, "calib/mu_w": 0.8997916666666668, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2860236220472442, "calib/std_conf": 0.04201399862369875, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7828269484808454, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.012518663317068857, "calib/step_q_w": 0.7703082851637766, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 475.1875, "completions/mean_terminated_length": 477.0509948730469, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.013866666666666666, "grad_norm": 0.04528241604566574, "kl": 0.0027894973754882812, "learning_rate": 3.2500000000000002e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034327417612075806, "mask/share_reasoning": 0.8470451831817627, "mask/share_step_conf": 0.11472119390964508, "num_tokens": 3049057.0, "reward": 0.5216453671455383, "reward_std": 0.1522456407546997, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6816074252128601, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.04058952257037163, "step": 13 }, { "adv/mean_abs_final_conf": 0.7593710422515869, "adv/mean_abs_reasoning": 0.45527908205986023, "adv/mean_abs_step_conf": 0.6822381615638733, "adv/ratio_final_to_reasoning": 1.6679242956120364, "adv/ratio_step_to_reasoning": 1.498505396903283, "adv/std_final_conf": 0.9233303070068359, "adv/std_reasoning": 0.7205995917320251, "adv/std_step_conf": 0.9171870946884155, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4897293456495059, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.38156, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.764, "calib/gap": 4.521671726598342e-06, "calib/mean_conf": 0.9295599999999999, "calib/mu_c": 0.9295620437956205, "calib/mu_w": 0.9295575221238939, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.38156, "calib/std_conf": 0.038845931575906364, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7778933333333333, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.016901954022988663, "calib/step_q_w": 0.7609913793103447, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2811.0, "completions/max_terminated_length": 2811.0, "completions/mean_length": 543.46875, "completions/mean_terminated_length": 547.748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.014933333333333333, "grad_norm": 0.05144192650914192, "kl": 0.0051441192626953125, "learning_rate": 3.5e-06, "loss": 0.0097, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0320378839969635, "mask/share_reasoning": 0.845973551273346, "mask/share_step_conf": 0.11417604982852936, "num_tokens": 3293585.0, "reward": 0.46880167722702026, "reward_std": 0.16207382082939148, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5910245776176453, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.044235050678253174, "step": 14 }, { "adv/mean_abs_final_conf": 0.791628360748291, "adv/mean_abs_reasoning": 0.45356833934783936, "adv/mean_abs_step_conf": 0.6169772148132324, "adv/ratio_final_to_reasoning": 1.7453342574275121, "adv/ratio_step_to_reasoning": 1.360273990244446, "adv/std_final_conf": 0.9135460257530212, "adv/std_reasoning": 0.7014039754867554, "adv/std_step_conf": 0.8426421284675598, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4774564283910078, "calib/avg_num_step_conf": 5.2890625, "calib/ece": 0.3621960784313725, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9254901960784314, "calib/gap": 0.0007558726951250039, "calib/mean_conf": 0.9425882352941177, "calib/mu_c": 0.9429054054054052, "calib/mu_w": 0.9421495327102802, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3621960784313725, "calib/std_conf": 0.02980273475449185, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7597831978319783, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.008160470559251065, "calib/step_q_w": 0.7516227272727273, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 466.58203125, "completions/mean_terminated_length": 466.58203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.016, "grad_norm": 0.050164178013801575, "kl": 0.013729095458984375, "learning_rate": 3.7500000000000005e-06, "loss": 0.0226, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034857138991355896, "mask/share_reasoning": 0.8444660902023315, "mask/share_step_conf": 0.12067677080631256, "num_tokens": 3520910.0, "reward": 0.4993246793746948, "reward_std": 0.17530518770217896, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6217886209487915, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.06279823184013367, "step": 15 }, { "adv/mean_abs_final_conf": 0.7583369016647339, "adv/mean_abs_reasoning": 0.4722632169723511, "adv/mean_abs_step_conf": 0.6195244193077087, "adv/ratio_final_to_reasoning": 1.6057505103327392, "adv/ratio_step_to_reasoning": 1.3118201821421531, "adv/std_final_conf": 0.9109633564949036, "adv/std_reasoning": 0.7206650376319885, "adv/std_step_conf": 0.8592668771743774, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4530930217204727, "calib/avg_num_step_conf": 6.328125, "calib/ece": 0.34253968253968253, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9444444444444444, "calib/gap": 0.00030897207367774904, "calib/mean_conf": 0.9496825396825397, "calib/mu_c": 0.9498039215686274, "calib/mu_w": 0.9494949494949496, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34253968253968253, "calib/std_conf": 0.03145761845132872, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7389576547231269, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.016682976611539058, "calib/step_q_w": 0.7222746781115879, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 616.30078125, "completions/mean_terminated_length": 618.7177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.017066666666666667, "grad_norm": 0.02984270639717579, "kl": 0.0108184814453125, "learning_rate": 4.000000000000001e-06, "loss": 0.0295, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.026550419628620148, "mask/share_reasoning": 0.8582978248596191, "mask/share_step_conf": 0.11124549061059952, "num_tokens": 3787531.0, "reward": 0.5103954076766968, "reward_std": 0.18981704115867615, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6332523822784424, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.07113219797611237, "step": 16 }, { "adv/mean_abs_final_conf": 0.720360517501831, "adv/mean_abs_reasoning": 0.47343909740448, "adv/mean_abs_step_conf": 0.633313775062561, "adv/ratio_final_to_reasoning": 1.5215484345315806, "adv/ratio_step_to_reasoning": 1.3376879487447422, "adv/std_final_conf": 0.9117466807365417, "adv/std_reasoning": 0.7574771642684937, "adv/std_step_conf": 0.8587536811828613, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49344487903809936, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.2608627450980392, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9607843137254902, "calib/gap": 0.0007105606258149821, "calib/mean_conf": 0.9549803921568629, "calib/mu_c": 0.9551977401129943, "calib/mu_w": 0.9544871794871793, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2608627450980392, "calib/std_conf": 0.024795232144529607, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7409809809809811, "calib/step_q_c_n": 999.0, "calib/step_q_gap": 0.001966896473938662, "calib/step_q_w": 0.7390140845070424, "calib/step_q_w_n": 426.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 529.3984375, "completions/mean_terminated_length": 529.3984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.018133333333333335, "grad_norm": 0.022662891075015068, "kl": 0.015615463256835938, "learning_rate": 4.25e-06, "loss": 0.0548, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0320490263402462, "mask/share_reasoning": 0.847233772277832, "mask/share_step_conf": 0.12071716785430908, "num_tokens": 4026585.0, "reward": 0.563227653503418, "reward_std": 0.19576594233512878, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7071866989135742, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.0864560529589653, "step": 17 }, { "adv/mean_abs_final_conf": 0.7143813967704773, "adv/mean_abs_reasoning": 0.4436107277870178, "adv/mean_abs_step_conf": 0.5791374444961548, "adv/ratio_final_to_reasoning": 1.610378992262467, "adv/ratio_step_to_reasoning": 1.305508203972481, "adv/std_final_conf": 0.9057754874229431, "adv/std_reasoning": 0.7205713391304016, "adv/std_step_conf": 0.8207805752754211, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5059750662101932, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.41236000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": 0.002271817066080817, "calib/mean_conf": 0.96036, "calib/mu_c": 0.9613868613138684, "calib/mu_w": 0.9591150442477876, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.41236000000000006, "calib/std_conf": 0.020733316184344455, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7414520958083832, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.03702352437981182, "calib/step_q_w": 0.7044285714285714, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 518.6640625, "completions/mean_terminated_length": 522.748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0192, "grad_norm": 0.025784000754356384, "kl": 0.014551162719726562, "learning_rate": 4.5e-06, "loss": -0.0548, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03221127390861511, "mask/share_reasoning": 0.8548977971076965, "mask/share_step_conf": 0.10507843643426895, "num_tokens": 4270083.0, "reward": 0.4716760814189911, "reward_std": 0.1795901209115982, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.562788724899292, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.08212599158287048, "step": 18 }, { "adv/mean_abs_final_conf": 0.7339122295379639, "adv/mean_abs_reasoning": 0.38168931007385254, "adv/mean_abs_step_conf": 0.625856876373291, "adv/ratio_final_to_reasoning": 1.9228000632136126, "adv/ratio_step_to_reasoning": 1.6397023963081252, "adv/std_final_conf": 0.8913288116455078, "adv/std_reasoning": 0.6612443923950195, "adv/std_step_conf": 0.8595508337020874, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4467857142857143, "calib/avg_num_step_conf": 4.44921875, "calib/ece": 0.35854330708661414, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": -0.0007545454545453412, "calib/mean_conf": 0.9648425196850393, "calib/mu_c": 0.9645454545454547, "calib/mu_w": 0.9653, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35854330708661414, "calib/std_conf": 0.020131819604306045, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7394842406876792, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.031661111435978495, "calib/step_q_w": 0.7078231292517007, "calib/step_q_w_n": 441.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 479.828125, "completions/mean_terminated_length": 479.828125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.020266666666666665, "grad_norm": 0.040589913725852966, "kl": 0.02446746826171875, "learning_rate": 4.75e-06, "loss": 0.0162, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03327008709311485, "mask/share_reasoning": 0.8602676391601562, "mask/share_step_conf": 0.1064622700214386, "num_tokens": 4497679.0, "reward": 0.5226638317108154, "reward_std": 0.18506306409835815, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6269656419754028, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.09961201250553131, "step": 19 }, { "adv/mean_abs_final_conf": 0.729914665222168, "adv/mean_abs_reasoning": 0.38741302490234375, "adv/mean_abs_step_conf": 0.5823383927345276, "adv/ratio_final_to_reasoning": 1.884073632800961, "adv/ratio_step_to_reasoning": 1.5031461393981764, "adv/std_final_conf": 0.8936525583267212, "adv/std_reasoning": 0.6815253496170044, "adv/std_step_conf": 0.8105871677398682, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.47535392535392534, "calib/avg_num_step_conf": 5.23828125, "calib/ece": 0.4021513944223108, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.016493564993564913, "calib/mean_conf": 0.9599203187250996, "calib/mu_c": 0.9672142857142855, "calib/mu_w": 0.9507207207207206, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4021513944223108, "calib/std_conf": 0.08783809822337918, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7128631875881523, "calib/step_q_c_n": 709.0, "calib/step_q_gap": -0.004779217475138875, "calib/step_q_w": 0.7176424050632911, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 483.859375, "completions/mean_terminated_length": 483.859375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.021333333333333333, "grad_norm": 0.037850525230169296, "kl": 0.04192161560058594, "learning_rate": 5e-06, "loss": 0.0373, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035860609263181686, "mask/share_reasoning": 0.8376814126968384, "mask/share_step_conf": 0.12645795941352844, "num_tokens": 4726419.0, "reward": 0.5021113157272339, "reward_std": 0.18082544207572937, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5804694890975952, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.11828441917896271, "step": 20 }, { "adv/mean_abs_final_conf": 0.7311071157455444, "adv/mean_abs_reasoning": 0.5237146019935608, "adv/mean_abs_step_conf": 0.7014495730400085, "adv/ratio_final_to_reasoning": 1.3960029240401695, "adv/ratio_step_to_reasoning": 1.3393737168486148, "adv/std_final_conf": 0.8916097283363342, "adv/std_reasoning": 0.7753270268440247, "adv/std_step_conf": 0.8901202082633972, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4472435897435898, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.3778346456692915, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.002244871794871761, "calib/mean_conf": 0.9683858267716536, "calib/mu_c": 0.9674666666666667, "calib/mu_w": 0.9697115384615385, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3778346456692915, "calib/std_conf": 0.013286395412473855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6919458128078818, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.0014023345470123294, "calib/step_q_w": 0.6905434782608695, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 508.2421875, "completions/mean_terminated_length": 508.2421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0224, "grad_norm": 0.02349030040204525, "kl": 0.03557777404785156, "learning_rate": 4.9722222222222224e-06, "loss": 0.0378, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034186892211437225, "mask/share_reasoning": 0.8403205871582031, "mask/share_step_conf": 0.12549255788326263, "num_tokens": 4959489.0, "reward": 0.5053724646568298, "reward_std": 0.22540730237960815, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6093800067901611, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.0857398733496666, "step": 21 }, { "adv/mean_abs_final_conf": 0.6583297252655029, "adv/mean_abs_reasoning": 0.4400283396244049, "adv/mean_abs_step_conf": 0.6259439587593079, "adv/ratio_final_to_reasoning": 1.4961075594072728, "adv/ratio_step_to_reasoning": 1.4225082850199944, "adv/std_final_conf": 0.8842504024505615, "adv/std_reasoning": 0.7572833895683289, "adv/std_step_conf": 0.8597545027732849, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4411902844873859, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.3298425196850394, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0028515834675254537, "calib/mean_conf": 0.9676377952755906, "calib/mu_c": 0.9666049382716048, "calib/mu_w": 0.9694565217391302, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3298425196850394, "calib/std_conf": 0.013248484858454658, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6927090301003345, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.04015542386251769, "calib/step_q_w": 0.6525536062378168, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 484.9296875, "completions/mean_terminated_length": 484.9296875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.023466666666666667, "grad_norm": 0.019175713881850243, "kl": 0.03942108154296875, "learning_rate": 4.944444444444445e-06, "loss": -0.0136, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03305044025182724, "mask/share_reasoning": 0.8448143005371094, "mask/share_step_conf": 0.1221352368593216, "num_tokens": 5185447.0, "reward": 0.5561374425888062, "reward_std": 0.2011936604976654, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6535523533821106, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.1337224692106247, "step": 22 }, { "adv/mean_abs_final_conf": 0.7666720151901245, "adv/mean_abs_reasoning": 0.5246292352676392, "adv/mean_abs_step_conf": 0.6609750390052795, "adv/ratio_final_to_reasoning": 1.4613596872827863, "adv/ratio_step_to_reasoning": 1.2598898318506473, "adv/std_final_conf": 0.902368426322937, "adv/std_reasoning": 0.7576389908790588, "adv/std_step_conf": 0.8911705613136292, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.48464576962283384, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.40019762845849793, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00159467380224243, "calib/mean_conf": 0.9693675889328063, "calib/mu_c": 0.9686805555555557, "calib/mu_w": 0.9702752293577981, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40019762845849793, "calib/std_conf": 0.013987404586703375, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6942194092827004, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.061555200649606845, "calib/step_q_w": 0.6326642086330936, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 500.6875, "completions/mean_terminated_length": 500.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.024533333333333334, "grad_norm": 0.083377905189991, "kl": 0.099700927734375, "learning_rate": 4.9166666666666665e-06, "loss": 0.0351, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0359768271446228, "mask/share_reasoning": 0.8433985710144043, "mask/share_step_conf": 0.12062457948923111, "num_tokens": 5417559.0, "reward": 0.5095414519309998, "reward_std": 0.2534693479537964, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5866917371749878, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.12145359069108963, "step": 23 }, { "adv/mean_abs_final_conf": 0.7554526329040527, "adv/mean_abs_reasoning": 0.5276204347610474, "adv/mean_abs_step_conf": 0.6861553192138672, "adv/ratio_final_to_reasoning": 1.4318107926319945, "adv/ratio_step_to_reasoning": 1.3004714639693935, "adv/std_final_conf": 0.8965315222740173, "adv/std_reasoning": 0.7576207518577576, "adv/std_step_conf": 0.8904493451118469, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.45235368956743, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.44888446215139455, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": -0.007590330788804223, "calib/mean_conf": 0.9637051792828685, "calib/mu_c": 0.9600763358778626, "calib/mu_w": 0.9676666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4453386454183268, "calib/std_conf": 0.05644354028679722, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6370888324873096, "calib/step_q_c_n": 788.0, "calib/step_q_gap": -0.0030645881822681798, "calib/step_q_w": 0.6401534206695778, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 543.73828125, "completions/mean_terminated_length": 545.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0256, "grad_norm": 0.021823352202773094, "kl": 0.04891204833984375, "learning_rate": 4.888888888888889e-06, "loss": 0.0313, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03237513080239296, "mask/share_reasoning": 0.841971218585968, "mask/share_step_conf": 0.12174741923809052, "num_tokens": 5661268.0, "reward": 0.47645303606987, "reward_std": 0.24115003645420074, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5376160144805908, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.11685264110565186, "step": 24 }, { "adv/mean_abs_final_conf": 0.7542558908462524, "adv/mean_abs_reasoning": 0.44566476345062256, "adv/mean_abs_step_conf": 0.553978443145752, "adv/ratio_final_to_reasoning": 1.692428822521932, "adv/ratio_step_to_reasoning": 1.2430384642853418, "adv/std_final_conf": 0.8923020958900452, "adv/std_reasoning": 0.7013866305351257, "adv/std_step_conf": 0.7936119437217712, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.49227133367399073, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.3697254901960786, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.0065910832907513095, "calib/mean_conf": 0.9626666666666667, "calib/mu_c": 0.9653289473684211, "calib/mu_w": 0.9587378640776698, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3681568627450982, "calib/std_conf": 0.06610706309351273, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6531728235294118, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.027330450648055837, "calib/step_q_w": 0.6258423728813559, "calib/step_q_w_n": 590.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 472.76953125, "completions/mean_terminated_length": 474.6235656738281, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.02666666666666667, "grad_norm": 0.018645210191607475, "kl": 0.05243682861328125, "learning_rate": 4.861111111111111e-06, "loss": -0.0266, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032882969826459885, "mask/share_reasoning": 0.838165283203125, "mask/share_step_conf": 0.12504544854164124, "num_tokens": 5885521.0, "reward": 0.5245230197906494, "reward_std": 0.19636517763137817, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6212117075920105, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.10986563563346863, "step": 25 }, { "adv/mean_abs_final_conf": 0.6872193217277527, "adv/mean_abs_reasoning": 0.4882727563381195, "adv/mean_abs_step_conf": 0.5908165574073792, "adv/ratio_final_to_reasoning": 1.4074496535126495, "adv/ratio_step_to_reasoning": 1.2100133577763041, "adv/std_final_conf": 0.8929456472396851, "adv/std_reasoning": 0.7752744555473328, "adv/std_step_conf": 0.8273743987083435, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4585326953748006, "calib/avg_num_step_conf": 5.26171875, "calib/ece": 0.35901726427622854, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": -3.0347332978908348e-05, "calib/mean_conf": 0.9608233731739708, "calib/mu_c": 0.9608114035087718, "calib/mu_w": 0.9608417508417507, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35713147410358576, "calib/std_conf": 0.05848185806590261, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6276171087533157, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.005644090203568597, "calib/step_q_w": 0.6219730185497471, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 514.41796875, "completions/mean_terminated_length": 518.468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.027733333333333332, "grad_norm": 0.03125348687171936, "kl": 0.0532989501953125, "learning_rate": 4.833333333333333e-06, "loss": 0.0522, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030437037348747253, "mask/share_reasoning": 0.8499245047569275, "mask/share_step_conf": 0.11182597279548645, "num_tokens": 6122452.0, "reward": 0.5311751365661621, "reward_std": 0.20167267322540283, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.619178295135498, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.12832826375961304, "step": 26 }, { "adv/mean_abs_final_conf": 0.7396328449249268, "adv/mean_abs_reasoning": 0.5042452216148376, "adv/mean_abs_step_conf": 0.5995136499404907, "adv/ratio_final_to_reasoning": 1.4668118074699128, "adv/ratio_step_to_reasoning": 1.1889327339990599, "adv/std_final_conf": 0.9105829000473022, "adv/std_reasoning": 0.7576658129692078, "adv/std_step_conf": 0.8439804315567017, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4994914176732359, "calib/avg_num_step_conf": 5.89453125, "calib/ece": 0.44286852589641446, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9601593625498008, "calib/gap": 0.01786395422759035, "calib/mean_conf": 0.9467729083665338, "calib/mu_c": 0.9553846153846153, "calib/mu_w": 0.9375206611570249, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4358565737051794, "calib/std_conf": 0.1302539055210768, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.587694094488189, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.027034344376631658, "calib/step_q_w": 0.5606597501115573, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 521.63671875, "completions/mean_terminated_length": 523.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.0288, "grad_norm": 0.023065723478794098, "kl": 0.0671539306640625, "learning_rate": 4.805555555555556e-06, "loss": -0.0279, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031575899571180344, "mask/share_reasoning": 0.8402689695358276, "mask/share_step_conf": 0.12424890697002411, "num_tokens": 6361207.0, "reward": 0.48377370834350586, "reward_std": 0.24267910420894623, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5471562147140503, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.12351616472005844, "step": 27 }, { "adv/mean_abs_final_conf": 0.6833375692367554, "adv/mean_abs_reasoning": 0.43649131059646606, "adv/mean_abs_step_conf": 0.6208552122116089, "adv/ratio_final_to_reasoning": 1.5655238779048626, "adv/ratio_step_to_reasoning": 1.422377025932565, "adv/std_final_conf": 0.8752068281173706, "adv/std_reasoning": 0.7206342816352844, "adv/std_step_conf": 0.8592632412910461, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5106352807714123, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.29656000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": 0.04732983550765746, "calib/mean_conf": 0.95256, "calib/mu_c": 0.9688414634146341, "calib/mu_w": 0.9215116279069766, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.29656000000000005, "calib/std_conf": 0.11311519084543863, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5751779026217229, "calib/step_q_c_n": 801.0, "calib/step_q_gap": -0.0035662834247888187, "calib/step_q_w": 0.5787441860465117, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 527.71875, "completions/mean_terminated_length": 527.71875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.029866666666666666, "grad_norm": 0.01976063847541809, "kl": 0.05889892578125, "learning_rate": 4.777777777777778e-06, "loss": -0.0497, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03161630034446716, "mask/share_reasoning": 0.8620358109474182, "mask/share_step_conf": 0.1063479334115982, "num_tokens": 6603247.0, "reward": 0.5521137118339539, "reward_std": 0.20202390849590302, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6736171245574951, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.1118602454662323, "step": 28 }, { "adv/mean_abs_final_conf": 0.7208306789398193, "adv/mean_abs_reasoning": 0.4676518738269806, "adv/mean_abs_step_conf": 0.5617038011550903, "adv/ratio_final_to_reasoning": 1.5413830656573153, "adv/ratio_step_to_reasoning": 1.2011152581480014, "adv/std_final_conf": 0.9061382412910461, "adv/std_reasoning": 0.7393488883972168, "adv/std_step_conf": 0.8107452392578125, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5335230245944532, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.5564940239043825, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": -0.006050497121925624, "calib/mean_conf": 0.956812749003984, "calib/mu_c": 0.9532692307692308, "calib/mu_w": 0.9593197278911564, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.5494820717131474, "calib/std_conf": 0.09873402234356803, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5588435374149661, "calib/step_q_c_n": 588.0, "calib/step_q_gap": -0.011776814620237386, "calib/step_q_w": 0.5706203520352034, "calib/step_q_w_n": 909.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 578.09765625, "completions/mean_terminated_length": 580.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.030933333333333334, "grad_norm": 0.020227886736392975, "kl": 0.05834197998046875, "learning_rate": 4.75e-06, "loss": -0.018, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027788184583187103, "mask/share_reasoning": 0.8547316789627075, "mask/share_step_conf": 0.11357386410236359, "num_tokens": 6858368.0, "reward": 0.4075689911842346, "reward_std": 0.22545595467090607, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.4374484419822693, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.10190830379724503, "step": 29 }, { "adv/mean_abs_final_conf": 0.7064460515975952, "adv/mean_abs_reasoning": 0.5811172723770142, "adv/mean_abs_step_conf": 0.6413973569869995, "adv/ratio_final_to_reasoning": 1.2156686527453118, "adv/ratio_step_to_reasoning": 1.1037313593578357, "adv/std_final_conf": 0.8989757299423218, "adv/std_reasoning": 0.8098927736282349, "adv/std_step_conf": 0.8913478255271912, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49147523914882546, "calib/avg_num_step_conf": 6.06640625, "calib/ece": 0.4411693548387097, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9314516129032258, "calib/gap": 0.01790264853256973, "calib/mean_conf": 0.9418951612903227, "calib/mu_c": 0.9506299212598425, "calib/mu_w": 0.9327272727272727, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.435483870967742, "calib/std_conf": 0.13226027117553324, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5203122222222222, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.01897068560757631, "calib/step_q_w": 0.5013415366146459, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 622.63671875, "completions/mean_terminated_length": 622.63671875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.032, "grad_norm": 0.02318803034722805, "kl": 0.05533599853515625, "learning_rate": 4.722222222222222e-06, "loss": 0.0183, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.027279648929834366, "mask/share_reasoning": 0.8624259829521179, "mask/share_step_conf": 0.11029434204101562, "num_tokens": 7124747.0, "reward": 0.4773857593536377, "reward_std": 0.2562609910964966, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5316585898399353, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.13170668482780457, "step": 30 }, { "adv/mean_abs_final_conf": 0.7335243821144104, "adv/mean_abs_reasoning": 0.46064838767051697, "adv/mean_abs_step_conf": 0.6094856858253479, "adv/ratio_final_to_reasoning": 1.5923737100737896, "adv/ratio_step_to_reasoning": 1.3231039164328697, "adv/std_final_conf": 0.8986032009124756, "adv/std_reasoning": 0.7393335103988647, "adv/std_step_conf": 0.8598284721374512, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5216468052738337, "calib/avg_num_step_conf": 6.4921875, "calib/ece": 0.5017126984126984, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9444444444444444, "calib/gap": -0.001043204868154124, "calib/mean_conf": 0.9330492063492063, "calib/mu_c": 0.9324862068965518, "calib/mu_w": 0.9335294117647059, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.48722222222222217, "calib/std_conf": 0.16609510984747, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.534985621761658, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.006574498166152409, "calib/step_q_w": 0.5284111235955056, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 619.80859375, "completions/mean_terminated_length": 622.2392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.03306666666666667, "grad_norm": 0.02012249082326889, "kl": 0.0571441650390625, "learning_rate": 4.694444444444445e-06, "loss": -0.0589, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027477439492940903, "mask/share_reasoning": 0.8551351428031921, "mask/share_step_conf": 0.11348114907741547, "num_tokens": 7389330.0, "reward": 0.43877488374710083, "reward_std": 0.22702966630458832, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.4916439950466156, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.09996829926967621, "step": 31 }, { "adv/mean_abs_final_conf": 0.6898159980773926, "adv/mean_abs_reasoning": 0.4708753228187561, "adv/mean_abs_step_conf": 0.586786687374115, "adv/ratio_final_to_reasoning": 1.4649652777470112, "adv/ratio_step_to_reasoning": 1.2461614761663231, "adv/std_final_conf": 0.8823645710945129, "adv/std_reasoning": 0.757497251033783, "adv/std_step_conf": 0.8274481296539307, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5043056358845832, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.42206679999999985, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.916, "calib/gap": 0.032523366107576446, "calib/mean_conf": 0.9179331999999999, "calib/mu_c": 0.9331541353383457, "calib/mu_w": 0.9006307692307692, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.40399999999999986, "calib/std_conf": 0.19783963985450437, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5054385941644562, "calib/step_q_c_n": 754.0, "calib/step_q_gap": -0.009168813242951157, "calib/step_q_w": 0.5146074074074074, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 588.515625, "completions/mean_terminated_length": 590.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.034133333333333335, "grad_norm": 0.022655580192804337, "kl": 0.06809234619140625, "learning_rate": 4.666666666666667e-06, "loss": -0.0424, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028394188731908798, "mask/share_reasoning": 0.85479736328125, "mask/share_step_conf": 0.11290225386619568, "num_tokens": 7646694.0, "reward": 0.48970359563827515, "reward_std": 0.21243546903133392, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5654059648513794, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.11556365340948105, "step": 32 }, { "adv/mean_abs_final_conf": 0.6951466798782349, "adv/mean_abs_reasoning": 0.44092267751693726, "adv/mean_abs_step_conf": 0.6073142290115356, "adv/ratio_final_to_reasoning": 1.576572753737603, "adv/ratio_step_to_reasoning": 1.3773712716062483, "adv/std_final_conf": 0.8783428072929382, "adv/std_reasoning": 0.7205365896224976, "adv/std_step_conf": 0.8275798559188843, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5990796393688956, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.4267992094861661, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9486166007905138, "calib/gap": 0.04681666666666651, "calib/mean_conf": 0.9371533596837944, "calib/mu_c": 0.9595439393939393, "calib/mu_w": 0.9127272727272728, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42110671936758903, "calib/std_conf": 0.1595536993084799, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5150755244755245, "calib/step_q_c_n": 858.0, "calib/step_q_gap": 0.032715027237955496, "calib/step_q_w": 0.482360497237569, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 561.71484375, "completions/mean_terminated_length": 563.9176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0352, "grad_norm": 0.019298963248729706, "kl": 0.06855010986328125, "learning_rate": 4.638888888888889e-06, "loss": -0.0105, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029010724276304245, "mask/share_reasoning": 0.8475635647773743, "mask/share_step_conf": 0.11951947212219238, "num_tokens": 7897365.0, "reward": 0.5079460740089417, "reward_std": 0.21426744759082794, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5690625309944153, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.14604836702346802, "step": 33 }, { "adv/mean_abs_final_conf": 0.7699817419052124, "adv/mean_abs_reasoning": 0.6623140573501587, "adv/mean_abs_step_conf": 0.7156409621238708, "adv/ratio_final_to_reasoning": 1.1625628859303085, "adv/ratio_step_to_reasoning": 1.0805160394557634, "adv/std_final_conf": 0.9187449812889099, "adv/std_reasoning": 0.8590694665908813, "adv/std_step_conf": 0.9215224385261536, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.495647807164379, "calib/avg_num_step_conf": 6.4140625, "calib/ece": 0.4053494623655915, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9274193548387096, "calib/gap": -0.025125320834728293, "calib/mean_conf": 0.920940860215054, "calib/mu_c": 0.9105057471264366, "calib/mu_w": 0.9356310679611649, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3708064516129033, "calib/std_conf": 0.19498509293441776, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49201563517915314, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.015668894541150336, "calib/step_q_w": 0.4763467406380028, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 543.3515625, "completions/mean_terminated_length": 549.7944946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.03626666666666667, "grad_norm": 0.019647598266601562, "kl": 0.0784454345703125, "learning_rate": 4.611111111111112e-06, "loss": -0.0374, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029811084270477295, "mask/share_reasoning": 0.8282800912857056, "mask/share_step_conf": 0.13019010424613953, "num_tokens": 8141575.0, "reward": 0.5082422494888306, "reward_std": 0.2950316071510315, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5750861763954163, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.13592958450317383, "step": 34 }, { "adv/mean_abs_final_conf": 0.7397118210792542, "adv/mean_abs_reasoning": 0.6048840284347534, "adv/mean_abs_step_conf": 0.6018543243408203, "adv/ratio_final_to_reasoning": 1.2228985827140981, "adv/ratio_step_to_reasoning": 0.9949912645209479, "adv/std_final_conf": 0.9181080460548401, "adv/std_reasoning": 0.8429375886917114, "adv/std_step_conf": 0.8277480006217957, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.47159090909090917, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.4174596774193549, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9032258064516129, "calib/gap": 0.023022466039707368, "calib/mean_conf": 0.9179435483870969, "calib/mu_c": 0.9287121212121211, "calib/mu_w": 0.9056896551724137, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.4015725806451613, "calib/std_conf": 0.1946362530639238, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5053319534282018, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.06689966643268613, "calib/step_q_w": 0.4384322869955157, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 609.09765625, "completions/mean_terminated_length": 616.3201904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.037333333333333336, "grad_norm": 0.028731856495141983, "kl": 0.070526123046875, "learning_rate": 4.583333333333333e-06, "loss": -0.0294, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02652989886701107, "mask/share_reasoning": 0.8616190552711487, "mask/share_step_conf": 0.10013231635093689, "num_tokens": 8406760.0, "reward": 0.4994659423828125, "reward_std": 0.2743852734565735, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5574097633361816, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.1454283446073532, "step": 35 }, { "adv/mean_abs_final_conf": 0.6597049236297607, "adv/mean_abs_reasoning": 0.3977164328098297, "adv/mean_abs_step_conf": 0.6627569198608398, "adv/ratio_final_to_reasoning": 1.658731873282194, "adv/ratio_step_to_reasoning": 1.666405672952771, "adv/std_final_conf": 0.8888344764709473, "adv/std_reasoning": 0.6815781593322754, "adv/std_step_conf": 0.8758692741394043, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5233920491273433, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.25962240000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.868, "calib/gap": -0.0036458306399481666, "calib/mean_conf": 0.9000576000000001, "calib/mu_c": 0.899065934065934, "calib/mu_w": 0.9027117647058822, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21584000000000003, "calib/std_conf": 0.20547091794762584, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4623435075885328, "calib/step_q_c_n": 1186.0, "calib/step_q_gap": 0.0674340139176467, "calib/step_q_w": 0.3949094936708861, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 546.93359375, "completions/mean_terminated_length": 549.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0384, "grad_norm": 0.024275243282318115, "kl": 0.081573486328125, "learning_rate": 4.555555555555556e-06, "loss": 0.0249, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03198616951704025, "mask/share_reasoning": 0.8275508880615234, "mask/share_step_conf": 0.1365567147731781, "num_tokens": 8649487.0, "reward": 0.6196425557136536, "reward_std": 0.22895273566246033, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7116386890411377, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.19014638662338257, "step": 36 }, { "adv/mean_abs_final_conf": 0.6607862710952759, "adv/mean_abs_reasoning": 0.3818843960762024, "adv/mean_abs_step_conf": 0.5758591294288635, "adv/ratio_final_to_reasoning": 1.73033063902255, "adv/ratio_step_to_reasoning": 1.5079409772845362, "adv/std_final_conf": 0.8723121881484985, "adv/std_reasoning": 0.6815993785858154, "adv/std_step_conf": 0.8437888622283936, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5174125874125873, "calib/avg_num_step_conf": 6.66015625, "calib/ece": 0.49933333333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.8875, "calib/gap": 0.007034965034965257, "calib/mean_conf": 0.9209166666666668, "calib/mu_c": 0.9247272727272728, "calib/mu_w": 0.9176923076923076, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.4809583333333333, "calib/std_conf": 0.1765219336387282, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.46415400981996724, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.08738015241594527, "calib/step_q_w": 0.376773857404022, "calib/step_q_w_n": 1094.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 602.8125, "completions/mean_terminated_length": 614.8207397460938, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.039466666666666664, "grad_norm": 0.017324605956673622, "kl": 0.072418212890625, "learning_rate": 4.527777777777778e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.027761541306972504, "mask/share_reasoning": 0.8374956250190735, "mask/share_step_conf": 0.115211620926857, "num_tokens": 8910903.0, "reward": 0.4224051833152771, "reward_std": 0.21838746964931488, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.47179919481277466, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.10113612562417984, "step": 37 }, { "adv/mean_abs_final_conf": 0.7052304744720459, "adv/mean_abs_reasoning": 0.522630512714386, "adv/mean_abs_step_conf": 0.6046627759933472, "adv/ratio_final_to_reasoning": 1.349386339517933, "adv/ratio_step_to_reasoning": 1.156960340591119, "adv/std_final_conf": 0.90242600440979, "adv/std_reasoning": 0.7928698062896729, "adv/std_step_conf": 0.8437419533729553, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5821311475409836, "calib/avg_num_step_conf": 6.0625, "calib/ece": 0.40919919028340085, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.854251012145749, "calib/gap": 0.08244124590163959, "calib/mean_conf": 0.8907360323886641, "calib/mu_c": 0.9314560000000003, "calib/mu_w": 0.8490147540983607, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.39693117408906886, "calib/std_conf": 0.22859311154316703, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4171976127320955, "calib/step_q_c_n": 754.0, "calib/step_q_gap": -0.00010827699221521714, "calib/step_q_w": 0.4173058897243107, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 575.53515625, "completions/mean_terminated_length": 577.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.04053333333333333, "grad_norm": 0.03242592141032219, "kl": 0.0756988525390625, "learning_rate": 4.5e-06, "loss": -0.013, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028629377484321594, "mask/share_reasoning": 0.8517426252365112, "mask/share_step_conf": 0.11572177708148956, "num_tokens": 9165128.0, "reward": 0.4896509349346161, "reward_std": 0.2444412112236023, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5658950209617615, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.12434432655572891, "step": 38 }, { "adv/mean_abs_final_conf": 0.7331459522247314, "adv/mean_abs_reasoning": 0.5253596305847168, "adv/mean_abs_step_conf": 0.6769396066665649, "adv/ratio_final_to_reasoning": 1.3955125394936643, "adv/ratio_step_to_reasoning": 1.2885261205036673, "adv/std_final_conf": 0.9099270701408386, "adv/std_reasoning": 0.7754834294319153, "adv/std_step_conf": 0.8914439082145691, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5475911458333335, "calib/avg_num_step_conf": 6.06640625, "calib/ece": 0.4627709677419355, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7943548387096774, "calib/gap": 0.0002566666666665718, "calib/mean_conf": 0.8553741935483872, "calib/mu_c": 0.8555066666666666, "calib/mu_w": 0.8552500000000001, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4171370967741935, "calib/std_conf": 0.2781030059432516, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4405322678843227, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.04696809377717981, "calib/step_q_w": 0.3935641741071429, "calib/step_q_w_n": 896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 574.56640625, "completions/mean_terminated_length": 579.090576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.0416, "grad_norm": 0.0406871996819973, "kl": 0.1913604736328125, "learning_rate": 4.472222222222223e-06, "loss": 0.0816, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029984187334775925, "mask/share_reasoning": 0.8428821563720703, "mask/share_step_conf": 0.11932115256786346, "num_tokens": 9418305.0, "reward": 0.46383437514305115, "reward_std": 0.2803102135658264, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5071962475776672, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.1360975205898285, "step": 39 }, { "adv/mean_abs_final_conf": 0.7949722409248352, "adv/mean_abs_reasoning": 0.5985231399536133, "adv/mean_abs_step_conf": 0.5397994518280029, "adv/ratio_final_to_reasoning": 1.328223067509883, "adv/ratio_step_to_reasoning": 0.9018856846033362, "adv/std_final_conf": 0.9357001185417175, "adv/std_reasoning": 0.826609194278717, "adv/std_step_conf": 0.8273929953575134, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5440705128205128, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.4767741935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.062131410256410224, "calib/mean_conf": 0.8395967741935485, "calib/mu_c": 0.8756730769230768, "calib/mu_w": 0.8135416666666666, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.44850806451612907, "calib/std_conf": 0.2786108952595646, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.38082547008547013, "calib/step_q_c_n": 585.0, "calib/step_q_gap": -0.012701147868600882, "calib/step_q_w": 0.393526617954071, "calib/step_q_w_n": 958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 605.015625, "completions/mean_terminated_length": 609.779541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.042666666666666665, "grad_norm": 0.021353064104914665, "kl": 0.07129669189453125, "learning_rate": 4.444444444444444e-06, "loss": -0.0266, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028631504625082016, "mask/share_reasoning": 0.8494503498077393, "mask/share_step_conf": 0.11410558968782425, "num_tokens": 9679949.0, "reward": 0.44868218898773193, "reward_std": 0.2748994827270508, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.5120519399642944, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.11109365522861481, "step": 40 }, { "adv/mean_abs_final_conf": 0.7356240153312683, "adv/mean_abs_reasoning": 0.45937642455101013, "adv/mean_abs_step_conf": 0.6705185174942017, "adv/ratio_final_to_reasoning": 1.6013534348225635, "adv/ratio_step_to_reasoning": 1.4596276205283274, "adv/std_final_conf": 0.9198719263076782, "adv/std_reasoning": 0.7207380533218384, "adv/std_step_conf": 0.8758156895637512, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5015070055392636, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.2623103174603175, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7261904761904762, "calib/gap": 0.033223753665689304, "calib/mean_conf": 0.8246738095238095, "calib/mu_c": 0.8333752688172044, "calib/mu_w": 0.8001515151515151, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1744444444444445, "calib/std_conf": 0.2917885461481744, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.377892166344294, "calib/step_q_c_n": 1034.0, "calib/step_q_gap": 0.013641598162475788, "calib/step_q_w": 0.3642505681818182, "calib/step_q_w_n": 352.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 499.03125, "completions/mean_terminated_length": 500.9882507324219, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.04373333333333333, "grad_norm": 0.030505994334816933, "kl": 0.0812530517578125, "learning_rate": 4.416666666666667e-06, "loss": -0.0707, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03425697237253189, "mask/share_reasoning": 0.8373620510101318, "mask/share_step_conf": 0.12447471916675568, "num_tokens": 9914949.0, "reward": 0.5994906425476074, "reward_std": 0.2487732619047165, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.6988155841827393, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.16344696283340454, "step": 41 }, { "adv/mean_abs_final_conf": 0.7042899131774902, "adv/mean_abs_reasoning": 0.42739468812942505, "adv/mean_abs_step_conf": 0.6312187910079956, "adv/ratio_final_to_reasoning": 1.6478677268076267, "adv/ratio_step_to_reasoning": 1.4768990081992968, "adv/std_final_conf": 0.9019165635108948, "adv/std_reasoning": 0.7205455899238586, "adv/std_step_conf": 0.8599382638931274, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6168877851120842, "calib/avg_num_step_conf": 6.375, "calib/ece": 0.3709996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.784, "calib/gap": 0.004650722175021382, "calib/mean_conf": 0.8806004000000001, "calib/mu_c": 0.8825909090909092, "calib/mu_w": 0.8779401869158878, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.33979999999999994, "calib/std_conf": 0.21863785774618263, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.34916286031042126, "calib/step_q_c_n": 902.0, "calib/step_q_gap": -0.02968010407314048, "calib/step_q_w": 0.37884296438356174, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 496.12890625, "completions/mean_terminated_length": 496.12890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0448, "grad_norm": 0.021938426420092583, "kl": 0.086944580078125, "learning_rate": 4.388888888888889e-06, "loss": -0.0052, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033286646008491516, "mask/share_reasoning": 0.826934814453125, "mask/share_step_conf": 0.13977853953838348, "num_tokens": 10146326.0, "reward": 0.5242855548858643, "reward_std": 0.22580347955226898, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5959059000015259, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.14719650149345398, "step": 42 }, { "adv/mean_abs_final_conf": 0.8100078105926514, "adv/mean_abs_reasoning": 0.6031104326248169, "adv/mean_abs_step_conf": 0.7097436785697937, "adv/ratio_final_to_reasoning": 1.343050570469142, "adv/ratio_step_to_reasoning": 1.1768055072118297, "adv/std_final_conf": 0.9271444082260132, "adv/std_reasoning": 0.8429136276245117, "adv/std_step_conf": 0.9062215089797974, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.591963727329581, "calib/avg_num_step_conf": 6.12109375, "calib/ece": 0.3472754940711462, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6956521739130435, "calib/gap": 0.12183120700437755, "calib/mean_conf": 0.8286928853754941, "calib/mu_c": 0.8879230769230767, "calib/mu_w": 0.7660918699186992, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.33106719367588927, "calib/std_conf": 0.2787819513384428, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.38588197596795726, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.011821951518079521, "calib/step_q_w": 0.37406002444987774, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 533.41796875, "completions/mean_terminated_length": 535.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.04586666666666667, "grad_norm": 0.025357814505696297, "kl": 0.08330535888671875, "learning_rate": 4.361111111111112e-06, "loss": -0.0157, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032418474555015564, "mask/share_reasoning": 0.8399398326873779, "mask/share_step_conf": 0.12373548001050949, "num_tokens": 10388105.0, "reward": 0.5459194779396057, "reward_std": 0.31446224451065063, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6143790483474731, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.18214738368988037, "step": 43 }, { "adv/mean_abs_final_conf": 0.7766730189323425, "adv/mean_abs_reasoning": 0.46539586782455444, "adv/mean_abs_step_conf": 0.5761933326721191, "adv/ratio_final_to_reasoning": 1.6688438222771969, "adv/ratio_step_to_reasoning": 1.238071440912177, "adv/std_final_conf": 0.9311361312866211, "adv/std_reasoning": 0.7575474977493286, "adv/std_step_conf": 0.8109161853790283, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5664017800381437, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.37617848605577686, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5936254980079682, "calib/gap": 0.05650387794024159, "calib/mean_conf": 0.7825466135458167, "calib/mu_c": 0.8118115702479338, "calib/mu_w": 0.7553076923076922, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3383266932270916, "calib/std_conf": 0.2996780140917351, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.37966990014265334, "calib/step_q_c_n": 701.0, "calib/step_q_gap": 0.03266201577997263, "calib/step_q_w": 0.3470078843626807, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 557.71875, "completions/mean_terminated_length": 557.71875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.046933333333333334, "grad_norm": 0.032184794545173645, "kl": 0.06911468505859375, "learning_rate": 4.333333333333334e-06, "loss": -0.0023, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029920445755124092, "mask/share_reasoning": 0.8550323247909546, "mask/share_step_conf": 0.11504723876714706, "num_tokens": 10637201.0, "reward": 0.5133737325668335, "reward_std": 0.25019070506095886, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5757042169570923, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.1619807332754135, "step": 44 }, { "adv/mean_abs_final_conf": 0.7967990636825562, "adv/mean_abs_reasoning": 0.5444657206535339, "adv/mean_abs_step_conf": 0.6275107860565186, "adv/ratio_final_to_reasoning": 1.4634512944655929, "adv/ratio_step_to_reasoning": 1.152525792263476, "adv/std_final_conf": 0.9364665746688843, "adv/std_reasoning": 0.7927656769752502, "adv/std_step_conf": 0.8600741028785706, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5570528417243745, "calib/avg_num_step_conf": 6.3359375, "calib/ece": 0.3916547244094488, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5748031496062992, "calib/gap": 0.07320076735916137, "calib/mean_conf": 0.7669279527559055, "calib/mu_c": 0.8064102564102563, "calib/mu_w": 0.733209489051095, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34897637795275593, "calib/std_conf": 0.31387597838827136, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3368719178082192, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.015331895386694494, "calib/step_q_w": 0.3215400224215247, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 529.58984375, "completions/mean_terminated_length": 529.58984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.048, "grad_norm": 0.018714895471930504, "kl": 0.07694244384765625, "learning_rate": 4.305555555555556e-06, "loss": 0.0439, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03286263346672058, "mask/share_reasoning": 0.8325269222259521, "mask/share_step_conf": 0.13461042940616608, "num_tokens": 10877824.0, "reward": 0.505123496055603, "reward_std": 0.2853018641471863, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.587199866771698, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.13398459553718567, "step": 45 }, { "adv/mean_abs_final_conf": 0.7839863896369934, "adv/mean_abs_reasoning": 0.522323489189148, "adv/mean_abs_step_conf": 0.685457706451416, "adv/ratio_final_to_reasoning": 1.500959474087695, "adv/ratio_step_to_reasoning": 1.312324106877745, "adv/std_final_conf": 0.9364987015724182, "adv/std_reasoning": 0.7754613757133484, "adv/std_step_conf": 0.8760055303573608, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5495217023491898, "calib/avg_num_step_conf": 6.9296875, "calib/ece": 0.3789681451612903, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5120967741935484, "calib/gap": 0.027758189627123064, "calib/mean_conf": 0.7249834677419356, "calib/mu_c": 0.739198347107438, "calib/mu_w": 0.711440157480315, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.30802419354838706, "calib/std_conf": 0.3408398898204456, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.34816039886039885, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.0753416488603989, "calib/step_q_w": 0.27281874999999994, "calib/step_q_w_n": 1072.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 576.41796875, "completions/mean_terminated_length": 578.678466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.04906666666666667, "grad_norm": 0.027084004133939743, "kl": 0.07015228271484375, "learning_rate": 4.277777777777778e-06, "loss": 0.1402, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034325309097766876, "mask/share_reasoning": 0.82254958152771, "mask/share_step_conf": 0.13921888172626495, "num_tokens": 11130155.0, "reward": 0.4992111325263977, "reward_std": 0.27551764249801636, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5645220279693604, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.147962749004364, "step": 46 }, { "adv/mean_abs_final_conf": 0.8063023686408997, "adv/mean_abs_reasoning": 0.5308449268341064, "adv/mean_abs_step_conf": 0.6474096775054932, "adv/ratio_final_to_reasoning": 1.5189037850462044, "adv/ratio_step_to_reasoning": 1.219583431580602, "adv/std_final_conf": 0.9366827607154846, "adv/std_reasoning": 0.7754331827163696, "adv/std_step_conf": 0.8600805401802063, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5429511028434759, "calib/avg_num_step_conf": 6.95703125, "calib/ece": 0.27374798387096766, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3790322580645161, "calib/gap": 0.05894590752059525, "calib/mean_conf": 0.6826229838709678, "calib/mu_c": 0.7078176056338028, "calib/mu_w": 0.6488716981132076, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1918951612903225, "calib/std_conf": 0.32413858495094305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3177204171240395, "calib/step_q_c_n": 911.0, "calib/step_q_gap": 0.03844064700909694, "calib/step_q_w": 0.27927977011494254, "calib/step_q_w_n": 870.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 573.96875, "completions/mean_terminated_length": 580.7747192382812, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.050133333333333335, "grad_norm": 0.021832922473549843, "kl": 0.0714874267578125, "learning_rate": 4.25e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030538445338606834, "mask/share_reasoning": 0.8278563022613525, "mask/share_step_conf": 0.12988652288913727, "num_tokens": 11383067.0, "reward": 0.5638201832771301, "reward_std": 0.2711506485939026, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6404308080673218, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.18408450484275818, "step": 47 }, { "adv/mean_abs_final_conf": 0.7903881669044495, "adv/mean_abs_reasoning": 0.6245477795600891, "adv/mean_abs_step_conf": 0.6516367793083191, "adv/ratio_final_to_reasoning": 1.2655367495841117, "adv/ratio_step_to_reasoning": 1.0433737828150003, "adv/std_final_conf": 0.933782696723938, "adv/std_reasoning": 0.8431242108345032, "adv/std_step_conf": 0.8599971532821655, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5795031055900621, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.3187391129032259, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.29838709677419356, "calib/gap": 0.08260212487741081, "calib/mean_conf": 0.5967447580645162, "calib/mu_c": 0.6410434782608696, "calib/mu_w": 0.5584413533834588, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.22588709677419358, "calib/std_conf": 0.3521657136801891, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.31856990445859873, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.0052612837689435255, "calib/step_q_w": 0.3133086206896552, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 487.3828125, "completions/mean_terminated_length": 491.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.0512, "grad_norm": 0.027312422171235085, "kl": 0.09047698974609375, "learning_rate": 4.222222222222223e-06, "loss": -0.0647, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03499788045883179, "mask/share_reasoning": 0.824286937713623, "mask/share_step_conf": 0.13290269672870636, "num_tokens": 11611525.0, "reward": 0.5018287897109985, "reward_std": 0.2579227089881897, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6198238134384155, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.10414617508649826, "step": 48 }, { "adv/mean_abs_final_conf": 0.8211216926574707, "adv/mean_abs_reasoning": 0.49190300703048706, "adv/mean_abs_step_conf": 0.638489842414856, "adv/ratio_final_to_reasoning": 1.6692756110892801, "adv/ratio_step_to_reasoning": 1.2979994699956852, "adv/std_final_conf": 0.9365095496177673, "adv/std_reasoning": 0.775332510471344, "adv/std_step_conf": 0.8690310120582581, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5395284327323162, "calib/avg_num_step_conf": 6.421875, "calib/ece": 0.27635960000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.276, "calib/gap": 0.03564363648371971, "calib/mean_conf": 0.5975604, "calib/mu_c": 0.6122455782312926, "calib/mu_w": 0.5766019417475728, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14296000000000003, "calib/std_conf": 0.3306472083533143, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.33921595982142855, "calib/step_q_c_n": 896.0, "calib/step_q_gap": 0.037272644313407155, "calib/step_q_w": 0.3019433155080214, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 498.375, "completions/mean_terminated_length": 500.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.05226666666666667, "grad_norm": 0.031009694561362267, "kl": 0.083892822265625, "learning_rate": 4.194444444444445e-06, "loss": -0.0196, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034359537065029144, "mask/share_reasoning": 0.8226783275604248, "mask/share_step_conf": 0.13905586302280426, "num_tokens": 11843645.0, "reward": 0.5960239171981812, "reward_std": 0.2486967295408249, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6432890892028809, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.24094633758068085, "step": 49 }, { "adv/mean_abs_final_conf": 0.7990678548812866, "adv/mean_abs_reasoning": 0.5750880241394043, "adv/mean_abs_step_conf": 0.6803764700889587, "adv/ratio_final_to_reasoning": 1.3894705181473028, "adv/ratio_step_to_reasoning": 1.183082313541678, "adv/std_final_conf": 0.9364927411079407, "adv/std_reasoning": 0.809965968132019, "adv/std_step_conf": 0.8753259778022766, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6541068842964328, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.20060240963855422, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.18072289156626506, "calib/gap": 0.18681913913386877, "calib/mean_conf": 0.5140160642570281, "calib/mu_c": 0.5942957746478874, "calib/mu_w": 0.40747663551401864, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.07216867469879518, "calib/std_conf": 0.342627362862484, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.3605495133819951, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.05624234920289056, "calib/step_q_w": 0.30430716417910453, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 524.78515625, "completions/mean_terminated_length": 524.78515625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.05333333333333334, "grad_norm": 0.02832883410155773, "kl": 0.08348846435546875, "learning_rate": 4.166666666666667e-06, "loss": -0.0453, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035481106489896774, "mask/share_reasoning": 0.8353006839752197, "mask/share_step_conf": 0.1292182058095932, "num_tokens": 12083350.0, "reward": 0.5965955853462219, "reward_std": 0.23546381294727325, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6845155954360962, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.20945677161216736, "step": 50 }, { "adv/mean_abs_final_conf": 0.8088560104370117, "adv/mean_abs_reasoning": 0.4844133257865906, "adv/mean_abs_step_conf": 0.6645538210868835, "adv/ratio_final_to_reasoning": 1.6697641608508416, "adv/ratio_step_to_reasoning": 1.3718735338417472, "adv/std_final_conf": 0.936579704284668, "adv/std_reasoning": 0.7393724322319031, "adv/std_step_conf": 0.8760242462158203, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5950849939517413, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.23694444444444443, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.15079365079365079, "calib/gap": 0.0998160056025974, "calib/mean_conf": 0.562579365079365, "calib/mu_c": 0.6073381294964028, "calib/mu_w": 0.5075221238938054, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12396825396825398, "calib/std_conf": 0.32518751252832384, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.32191111111111115, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.032344399338445406, "calib/step_q_w": 0.28956671177266574, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 507.90625, "completions/mean_terminated_length": 509.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.0544, "grad_norm": 0.025819355621933937, "kl": 0.08200836181640625, "learning_rate": 4.138888888888889e-06, "loss": 0.0036, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03415883332490921, "mask/share_reasoning": 0.8322888612747192, "mask/share_step_conf": 0.12964603304862976, "num_tokens": 12322670.0, "reward": 0.5817782878875732, "reward_std": 0.2308366894721985, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6703804731369019, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.19083234667778015, "step": 51 }, { "adv/mean_abs_final_conf": 0.7853611707687378, "adv/mean_abs_reasoning": 0.48178425431251526, "adv/mean_abs_step_conf": 0.7020696401596069, "adv/ratio_final_to_reasoning": 1.630109667841705, "adv/ratio_step_to_reasoning": 1.4572282798270964, "adv/std_final_conf": 0.9366201758384705, "adv/std_reasoning": 0.7393109798431396, "adv/std_step_conf": 0.8913068175315857, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6149801119126272, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.2152362204724409, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2637795275590551, "calib/gap": 0.13821951055079884, "calib/mean_conf": 0.6034251968503938, "calib/mu_c": 0.6529447852760736, "calib/mu_w": 0.5147252747252747, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08846456692913379, "calib/std_conf": 0.3401292024007181, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.34160771704180065, "calib/step_q_c_n": 933.0, "calib/step_q_gap": 0.01522137250398553, "calib/step_q_w": 0.3263863445378151, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 493.93359375, "completions/mean_terminated_length": 493.93359375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.055466666666666664, "grad_norm": 0.03201765939593315, "kl": 0.0821533203125, "learning_rate": 4.111111111111111e-06, "loss": 0.026, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03528433293104172, "mask/share_reasoning": 0.8396733999252319, "mask/share_step_conf": 0.12504222989082336, "num_tokens": 12557069.0, "reward": 0.6412832736968994, "reward_std": 0.24396094679832458, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7095109224319458, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.24883678555488586, "step": 52 }, { "adv/mean_abs_final_conf": 0.7655020952224731, "adv/mean_abs_reasoning": 0.44119948148727417, "adv/mean_abs_step_conf": 0.5944796800613403, "adv/ratio_final_to_reasoning": 1.735047585826669, "adv/ratio_step_to_reasoning": 1.3474169961790567, "adv/std_final_conf": 0.9352856874465942, "adv/std_reasoning": 0.7013611197471619, "adv/std_step_conf": 0.8276461958885193, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7021643052125307, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.16596078431372555, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3176470588235294, "calib/gap": 0.20318731314181726, "calib/mean_conf": 0.6830588235294118, "calib/mu_c": 0.7611464968152867, "calib/mu_w": 0.5579591836734694, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11666666666666671, "calib/std_conf": 0.3046220037476665, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.38241730769230764, "calib/step_q_c_n": 936.0, "calib/step_q_gap": 0.05282037220843666, "calib/step_q_w": 0.329596935483871, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 491.421875, "completions/mean_terminated_length": 493.34906005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.05653333333333333, "grad_norm": 0.027992183342576027, "kl": 0.082122802734375, "learning_rate": 4.083333333333334e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03341158479452133, "mask/share_reasoning": 0.8357678055763245, "mask/share_step_conf": 0.1269143521785736, "num_tokens": 12788697.0, "reward": 0.616245687007904, "reward_std": 0.21216043829917908, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7519054412841797, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.16027340292930603, "step": 53 }, { "adv/mean_abs_final_conf": 0.7329035997390747, "adv/mean_abs_reasoning": 0.26085180044174194, "adv/mean_abs_step_conf": 0.5300008058547974, "adv/ratio_final_to_reasoning": 2.809655131756546, "adv/ratio_step_to_reasoning": 2.0318081184690406, "adv/std_final_conf": 0.9351081252098083, "adv/std_reasoning": 0.5482179522514343, "adv/std_step_conf": 0.7759180068969727, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6930783242258652, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.15486274509803927, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4392156862745098, "calib/gap": 0.1926161202185791, "calib/mean_conf": 0.770313725490196, "calib/mu_c": 0.8246994535519125, "calib/mu_w": 0.6320833333333334, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.103764705882353, "calib/std_conf": 0.26864492858557404, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.37470414201183433, "calib/step_q_c_n": 1014.0, "calib/step_q_gap": 0.06894181017326928, "calib/step_q_w": 0.30576233183856505, "calib/step_q_w_n": 446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 442.07421875, "completions/mean_terminated_length": 442.07421875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.0576, "grad_norm": 0.04002760723233223, "kl": 0.093963623046875, "learning_rate": 4.055555555555556e-06, "loss": 0.0098, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03997696191072464, "mask/share_reasoning": 0.8215521574020386, "mask/share_step_conf": 0.13847088813781738, "num_tokens": 13008100.0, "reward": 0.6731469035148621, "reward_std": 0.18424615263938904, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7864863276481628, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2199636995792389, "step": 54 }, { "adv/mean_abs_final_conf": 0.7531729936599731, "adv/mean_abs_reasoning": 0.4917422831058502, "adv/mean_abs_step_conf": 0.6786710023880005, "adv/ratio_final_to_reasoning": 1.5316417146455727, "adv/ratio_step_to_reasoning": 1.3801355419377528, "adv/std_final_conf": 0.9301316738128662, "adv/std_reasoning": 0.7574841976165771, "adv/std_step_conf": 0.8759620189666748, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7387531328320802, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.2932932806324109, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5731225296442688, "calib/gap": 0.20476538847117787, "calib/mean_conf": 0.8083667984189723, "calib/mu_c": 0.9054887218045113, "calib/mu_w": 0.7007233333333335, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.28798418972332, "calib/std_conf": 0.2683947523469457, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.378357771260997, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.013692884745643175, "calib/step_q_w": 0.36466488651535384, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 459.015625, "completions/mean_terminated_length": 460.8157043457031, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.058666666666666666, "grad_norm": 0.030970344319939613, "kl": 0.0832977294921875, "learning_rate": 4.027777777777779e-06, "loss": 0.0321, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037906937301158905, "mask/share_reasoning": 0.8295423984527588, "mask/share_step_conf": 0.1286444365978241, "num_tokens": 13233432.0, "reward": 0.6227953433990479, "reward_std": 0.27070003747940063, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6887440085411072, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.25606536865234375, "step": 55 }, { "adv/mean_abs_final_conf": 0.7386385202407837, "adv/mean_abs_reasoning": 0.5914474129676819, "adv/mean_abs_step_conf": 0.653753399848938, "adv/ratio_final_to_reasoning": 1.2488659245875249, "adv/ratio_step_to_reasoning": 1.1053449309527383, "adv/std_final_conf": 0.9249632954597473, "adv/std_reasoning": 0.8098851442337036, "adv/std_step_conf": 0.875950038433075, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.65819099378882, "calib/avg_num_step_conf": 6.07421875, "calib/ece": 0.3315600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.736, "calib/gap": 0.14646868530020685, "calib/mean_conf": 0.86844, "calib/mu_c": 0.9340579710144926, "calib/mu_w": 0.7875892857142858, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.32400000000000007, "calib/std_conf": 0.24000742988499335, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4133532934131736, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.024954960079840327, "calib/step_q_w": 0.3883983333333333, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 499.828125, "completions/mean_terminated_length": 501.78826904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.05973333333333333, "grad_norm": 0.0224858820438385, "kl": 0.074920654296875, "learning_rate": 4.000000000000001e-06, "loss": 0.0394, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035190023481845856, "mask/share_reasoning": 0.8283034563064575, "mask/share_step_conf": 0.13260021805763245, "num_tokens": 13468228.0, "reward": 0.5894858837127686, "reward_std": 0.25769975781440735, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6431316137313843, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2366214096546173, "step": 56 }, { "adv/mean_abs_final_conf": 0.753666877746582, "adv/mean_abs_reasoning": 0.5388273000717163, "adv/mean_abs_step_conf": 0.6722787618637085, "adv/ratio_final_to_reasoning": 1.3987169500251215, "adv/ratio_step_to_reasoning": 1.24767019372298, "adv/std_final_conf": 0.9096561670303345, "adv/std_reasoning": 0.7755215764045715, "adv/std_step_conf": 0.86024409532547, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6441273779983456, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.31020080321285143, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.8072289156626506, "calib/gap": 0.06920595533498763, "calib/mean_conf": 0.9205622489959839, "calib/mu_c": 0.9464102564102563, "calib/mu_w": 0.8772043010752687, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.30212851405622493, "calib/std_conf": 0.16904077696719494, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.42818713450292395, "calib/step_q_c_n": 855.0, "calib/step_q_gap": 0.061822002923976604, "calib/step_q_w": 0.36636513157894735, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 487.68359375, "completions/mean_terminated_length": 489.5960998535156, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.0608, "grad_norm": 0.023623118177056313, "kl": 0.078521728515625, "learning_rate": 3.972222222222223e-06, "loss": -0.0689, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03461700677871704, "mask/share_reasoning": 0.8309827446937561, "mask/share_step_conf": 0.13049399852752686, "num_tokens": 13699867.0, "reward": 0.6036643385887146, "reward_std": 0.2721608877182007, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6471081972122192, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.24850167334079742, "step": 57 }, { "adv/mean_abs_final_conf": 0.7651212215423584, "adv/mean_abs_reasoning": 0.6441612243652344, "adv/mean_abs_step_conf": 0.7096105813980103, "adv/ratio_final_to_reasoning": 1.187779072384122, "adv/ratio_step_to_reasoning": 1.1016039999881562, "adv/std_final_conf": 0.9263598918914795, "adv/std_reasoning": 0.8429698944091797, "adv/std_step_conf": 0.9068658351898193, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5596751412429379, "calib/avg_num_step_conf": 7.296875, "calib/ece": 0.4251999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.832, "calib/gap": 0.024716230097585945, "calib/mean_conf": 0.93144, "calib/mu_c": 0.9431060606060606, "calib/mu_w": 0.9183898305084747, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4143199999999999, "calib/std_conf": 0.14631037693888976, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.40552511415525117, "calib/step_q_c_n": 876.0, "calib/step_q_gap": 0.012732775445573719, "calib/step_q_w": 0.39279233870967745, "calib/step_q_w_n": 992.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 581.96484375, "completions/mean_terminated_length": 581.96484375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.06186666666666667, "grad_norm": 0.023403571918606758, "kl": 0.067626953125, "learning_rate": 3.944444444444445e-06, "loss": 0.0358, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03249308094382286, "mask/share_reasoning": 0.8342177271842957, "mask/share_step_conf": 0.1332891881465912, "num_tokens": 13955170.0, "reward": 0.5350931882858276, "reward_std": 0.32655808329582214, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5575656294822693, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.21574564278125763, "step": 58 }, { "adv/mean_abs_final_conf": 0.7101393938064575, "adv/mean_abs_reasoning": 0.5235600471496582, "adv/mean_abs_step_conf": 0.6275352239608765, "adv/ratio_final_to_reasoning": 1.35636666256825, "adv/ratio_step_to_reasoning": 1.1985926492620573, "adv/std_final_conf": 0.8879631757736206, "adv/std_reasoning": 0.7576151490211487, "adv/std_step_conf": 0.8437442779541016, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5429366551815531, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.3480559523809525, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8849206349206349, "calib/gap": 0.03082495361781068, "calib/mean_conf": 0.9478170634920635, "calib/mu_c": 0.9598045454545454, "calib/mu_w": 0.9289795918367347, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.34238095238095245, "calib/std_conf": 0.13240877627345576, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.44405078597339787, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.07640372714986843, "calib/step_q_w": 0.36764705882352944, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 520.41015625, "completions/mean_terminated_length": 520.41015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.06293333333333333, "grad_norm": 0.024560702964663506, "kl": 0.082672119140625, "learning_rate": 3.916666666666667e-06, "loss": 0.047, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036245040595531464, "mask/share_reasoning": 0.8422383069992065, "mask/share_step_conf": 0.12151669710874557, "num_tokens": 14194643.0, "reward": 0.535647988319397, "reward_std": 0.27667325735092163, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6130132675170898, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.14578264951705933, "step": 59 }, { "adv/mean_abs_final_conf": 0.7291475534439087, "adv/mean_abs_reasoning": 0.5098700523376465, "adv/mean_abs_step_conf": 0.6434817314147949, "adv/ratio_final_to_reasoning": 1.4300654649178182, "adv/ratio_step_to_reasoning": 1.2620504547473756, "adv/std_final_conf": 0.8896569609642029, "adv/std_reasoning": 0.7753527164459229, "adv/std_step_conf": 0.8600745797157288, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6022727272727273, "calib/avg_num_step_conf": 6.0625, "calib/ece": 0.3914737051792829, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8605577689243028, "calib/gap": 0.03858461538461544, "calib/mean_conf": 0.9294824701195219, "calib/mu_c": 0.9460846153846153, "calib/mu_w": 0.9074999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3756175298804782, "calib/std_conf": 0.18617612343082873, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.424557463672391, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.05356111147113307, "calib/step_q_w": 0.37099635220125793, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 502.29296875, "completions/mean_terminated_length": 502.29296875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.064, "grad_norm": 0.03019045479595661, "kl": 0.09590911865234375, "learning_rate": 3.88888888888889e-06, "loss": 0.0738, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03679346665740013, "mask/share_reasoning": 0.8283411264419556, "mask/share_step_conf": 0.1348654180765152, "num_tokens": 14432086.0, "reward": 0.6083961725234985, "reward_std": 0.26783424615859985, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5977804660797119, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.31198057532310486, "step": 60 }, { "adv/mean_abs_final_conf": 0.661597490310669, "adv/mean_abs_reasoning": 0.4052918553352356, "adv/mean_abs_step_conf": 0.6843038201332092, "adv/ratio_final_to_reasoning": 1.6323976946524896, "adv/ratio_step_to_reasoning": 1.6884223334988806, "adv/std_final_conf": 0.8431702256202698, "adv/std_reasoning": 0.6815150380134583, "adv/std_step_conf": 0.8912301659584045, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5647274633123689, "calib/avg_num_step_conf": 6.265625, "calib/ece": 0.35517647058823537, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.03140526729559756, "calib/mean_conf": 0.9763529411764704, "calib/mu_c": 0.9881761006289308, "calib/mu_w": 0.9567708333333332, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3540000000000001, "calib/std_conf": 0.0906254570843184, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4547078280044101, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.03955861710053643, "calib/step_q_w": 0.4151492109038737, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 430.34765625, "completions/mean_terminated_length": 432.0353088378906, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.02419097349047661, "kl": 0.097686767578125, "learning_rate": 3.861111111111112e-06, "loss": -0.0056, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.041657865047454834, "mask/share_reasoning": 0.8071362972259521, "mask/share_step_conf": 0.14729955792427063, "num_tokens": 14646319.0, "reward": 0.6301652193069458, "reward_std": 0.2503102421760559, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6408730745315552, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.29601991176605225, "step": 61 }, { "adv/mean_abs_final_conf": 0.665837287902832, "adv/mean_abs_reasoning": 0.5249857902526855, "adv/mean_abs_step_conf": 0.648922860622406, "adv/ratio_final_to_reasoning": 1.268295828697291, "adv/ratio_step_to_reasoning": 1.2360769999318786, "adv/std_final_conf": 0.8489472270011902, "adv/std_reasoning": 0.7576203346252441, "adv/std_step_conf": 0.8600762486457825, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6343568196509373, "calib/avg_num_step_conf": 6.3515625, "calib/ece": 0.43188755020080316, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8955823293172691, "calib/gap": 0.06333290239172584, "calib/mean_conf": 0.9300401606425702, "calib/mu_c": 0.9603076923076923, "calib/mu_w": 0.8969747899159665, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4199196787148594, "calib/std_conf": 0.20584530002832052, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42193460490463214, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.0576969367431972, "calib/step_q_w": 0.36423766816143494, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 505.56640625, "completions/mean_terminated_length": 507.5490417480469, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.06613333333333334, "grad_norm": 0.03537999466061592, "kl": 0.0814056396484375, "learning_rate": 3.833333333333334e-06, "loss": -0.0413, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03413922339677811, "mask/share_reasoning": 0.8309030532836914, "mask/share_step_conf": 0.13105152547359467, "num_tokens": 14882824.0, "reward": 0.5236216187477112, "reward_std": 0.2916761040687561, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5576195120811462, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.19274872541427612, "step": 62 }, { "adv/mean_abs_final_conf": 0.6973801255226135, "adv/mean_abs_reasoning": 0.5464081168174744, "adv/mean_abs_step_conf": 0.6797524094581604, "adv/ratio_final_to_reasoning": 1.2762989861579432, "adv/ratio_step_to_reasoning": 1.2440379059837963, "adv/std_final_conf": 0.8700258135795593, "adv/std_reasoning": 0.7928805351257324, "adv/std_step_conf": 0.8913764357566833, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6766145436053903, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.4059523809523811, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.08016018306636163, "calib/mean_conf": 0.9338095238095238, "calib/mu_c": 0.970072463768116, "calib/mu_w": 0.8899122807017543, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.39607142857142874, "calib/std_conf": 0.18517840879514066, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.38816479400749065, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.043150629701541654, "calib/step_q_w": 0.345014164305949, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 550.06640625, "completions/mean_terminated_length": 550.06640625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.0672, "grad_norm": 0.03897147625684738, "kl": 0.07268524169921875, "learning_rate": 3.8055555555555556e-06, "loss": -0.0222, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033824674785137177, "mask/share_reasoning": 0.8467774391174316, "mask/share_step_conf": 0.11939793080091476, "num_tokens": 15132281.0, "reward": 0.5660736560821533, "reward_std": 0.30982935428619385, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5915511846542358, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.23825247585773468, "step": 63 }, { "adv/mean_abs_final_conf": 0.6953163743019104, "adv/mean_abs_reasoning": 0.6094165444374084, "adv/mean_abs_step_conf": 0.6779900789260864, "adv/ratio_final_to_reasoning": 1.1409542137452169, "adv/ratio_step_to_reasoning": 1.1125232570638242, "adv/std_final_conf": 0.8795359134674072, "adv/std_reasoning": 0.8267204761505127, "adv/std_step_conf": 0.8760717511177063, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5825639204545454, "calib/avg_num_step_conf": 6.296875, "calib/ece": 0.3200806451612903, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8911290322580645, "calib/gap": 0.07014772727272733, "calib/mean_conf": 0.9429838709677419, "calib/mu_c": 0.967875, "calib/mu_w": 0.8977272727272727, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3089516129032258, "calib/std_conf": 0.18104748904152584, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39315261044176714, "calib/step_q_c_n": 996.0, "calib/step_q_gap": 0.054922090961247616, "calib/step_q_w": 0.3382305194805195, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 493.47265625, "completions/mean_terminated_length": 493.47265625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.06826666666666667, "grad_norm": 0.025010351091623306, "kl": 0.08414459228515625, "learning_rate": 3.777777777777778e-06, "loss": 0.0187, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03600362688302994, "mask/share_reasoning": 0.8291494250297546, "mask/share_step_conf": 0.13484695553779602, "num_tokens": 15362386.0, "reward": 0.619375467300415, "reward_std": 0.3247393071651459, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6565030813217163, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.26271653175354004, "step": 64 }, { "adv/mean_abs_final_conf": 0.5977113246917725, "adv/mean_abs_reasoning": 0.3444897532463074, "adv/mean_abs_step_conf": 0.6133772134780884, "adv/ratio_final_to_reasoning": 1.7350627095849023, "adv/ratio_step_to_reasoning": 1.7805383402493504, "adv/std_final_conf": 0.8094469308853149, "adv/std_reasoning": 0.618577778339386, "adv/std_step_conf": 0.8277055621147156, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6030788177339902, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.41363281250000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.921875, "calib/gap": 0.0706724137931033, "calib/mean_conf": 0.9434765625, "calib/mu_c": 0.9754999999999999, "calib/mu_w": 0.9048275862068966, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40511718750000003, "calib/std_conf": 0.1964271280607228, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39180116959064326, "calib/step_q_c_n": 855.0, "calib/step_q_gap": 0.008709953374427037, "calib/step_q_w": 0.3830912162162162, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 404.38671875, "completions/mean_terminated_length": 405.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.06933333333333333, "grad_norm": 0.025119824334979057, "kl": 0.0997772216796875, "learning_rate": 3.7500000000000005e-06, "loss": 0.0213, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.042433351278305054, "mask/share_reasoning": 0.8121272325515747, "mask/share_step_conf": 0.14153316617012024, "num_tokens": 15570933.0, "reward": 0.6205974817276001, "reward_std": 0.22863918542861938, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5913464426994324, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.3412547707557678, "step": 65 }, { "adv/mean_abs_final_conf": 0.7006769776344299, "adv/mean_abs_reasoning": 0.4735182225704193, "adv/mean_abs_step_conf": 0.5800496339797974, "adv/ratio_final_to_reasoning": 1.479725476732268, "adv/ratio_step_to_reasoning": 1.2249784830477886, "adv/std_final_conf": 0.8699476718902588, "adv/std_reasoning": 0.7393708825111389, "adv/std_step_conf": 0.8110731244087219, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7274458700882117, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.3966530612244896, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7428571428571429, "calib/gap": 0.18507484629778137, "calib/mean_conf": 0.8544489795918369, "calib/mu_c": 0.9518965517241379, "calib/mu_w": 0.7668217054263565, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3888163265306121, "calib/std_conf": 0.283101269918683, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3741985815602837, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.10313516692613739, "calib/step_q_w": 0.2710634146341463, "calib/step_q_w_n": 1025.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 554.42578125, "completions/mean_terminated_length": 554.42578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.0704, "grad_norm": 0.02820185199379921, "kl": 0.07810211181640625, "learning_rate": 3.7222222222222225e-06, "loss": 0.098, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03546295315027237, "mask/share_reasoning": 0.8319535851478577, "mask/share_step_conf": 0.13258343935012817, "num_tokens": 15819218.0, "reward": 0.5298022031784058, "reward_std": 0.292568564414978, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.5911476612091064, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.185644268989563, "step": 66 }, { "adv/mean_abs_final_conf": 0.5539219379425049, "adv/mean_abs_reasoning": 0.32676050066947937, "adv/mean_abs_step_conf": 0.6197940707206726, "adv/ratio_final_to_reasoning": 1.6951924630045814, "adv/ratio_step_to_reasoning": 1.8967839425230861, "adv/std_final_conf": 0.7640647292137146, "adv/std_reasoning": 0.6185100078582764, "adv/std_step_conf": 0.8601025342941284, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7954672051246345, "calib/avg_num_step_conf": 6.20703125, "calib/ece": 0.23015810276679846, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.758893280632411, "calib/gap": 0.28796407185628736, "calib/mean_conf": 0.8500790513833992, "calib/mu_c": 0.9479640718562874, "calib/mu_w": 0.66, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21007905138339927, "calib/std_conf": 0.3014549594302738, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.373006993006993, "calib/step_q_c_n": 1001.0, "calib/step_q_gap": 0.05856821749678898, "calib/step_q_w": 0.31443877551020405, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 495.59765625, "completions/mean_terminated_length": 495.59765625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.07146666666666666, "grad_norm": 0.031442463397979736, "kl": 0.08463287353515625, "learning_rate": 3.694444444444445e-06, "loss": 0.0812, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03558950126171112, "mask/share_reasoning": 0.8336166143417358, "mask/share_step_conf": 0.13079385459423065, "num_tokens": 16051099.0, "reward": 0.7155471444129944, "reward_std": 0.2193281650543213, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7687581777572632, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.3342110514640808, "step": 67 }, { "adv/mean_abs_final_conf": 0.7066925764083862, "adv/mean_abs_reasoning": 0.5119824409484863, "adv/mean_abs_step_conf": 0.5657267570495605, "adv/ratio_final_to_reasoning": 1.3803062759324023, "adv/ratio_step_to_reasoning": 1.1049729674351894, "adv/std_final_conf": 0.8748133182525635, "adv/std_reasoning": 0.7753519415855408, "adv/std_step_conf": 0.7939915657043457, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7254966668866741, "calib/avg_num_step_conf": 6.28515625, "calib/ece": 0.3297798387096773, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8024193548387096, "calib/gap": 0.21026542142432825, "calib/mean_conf": 0.8761072580645162, "calib/mu_c": 0.9685223021582733, "calib/mu_w": 0.758256880733945, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3227016129032257, "calib/std_conf": 0.27274615330168983, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.37905317769131, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.05819398914715723, "calib/step_q_w": 0.32085918854415274, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2645.0, "completions/max_terminated_length": 2645.0, "completions/mean_length": 501.390625, "completions/mean_terminated_length": 501.390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.07253333333333334, "grad_norm": 0.023501932621002197, "kl": 0.0880279541015625, "learning_rate": 3.6666666666666666e-06, "loss": 0.0716, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.039146944880485535, "mask/share_reasoning": 0.8217398524284363, "mask/share_step_conf": 0.139113187789917, "num_tokens": 16283543.0, "reward": 0.6109815239906311, "reward_std": 0.2679465115070343, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6618926525115967, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.25772660970687866, "step": 68 }, { "adv/mean_abs_final_conf": 0.7272955775260925, "adv/mean_abs_reasoning": 0.5695148706436157, "adv/mean_abs_step_conf": 0.5426146388053894, "adv/ratio_final_to_reasoning": 1.277044051025686, "adv/ratio_step_to_reasoning": 0.9527664101065069, "adv/std_final_conf": 0.8821682333946228, "adv/std_reasoning": 0.7929249405860901, "adv/std_step_conf": 0.7939866781234741, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7060540247047645, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.2931020408163265, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.49795918367346936, "calib/gap": 0.2769784172661869, "calib/mean_conf": 0.6728571428571427, "calib/mu_c": 0.8299999999999998, "calib/mu_w": 0.553021582733813, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2666530612244898, "calib/std_conf": 0.37679204463199484, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4180929487179487, "calib/step_q_c_n": 624.0, "calib/step_q_gap": 0.09029651639895608, "calib/step_q_w": 0.32779643231899264, "calib/step_q_w_n": 953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 581.64453125, "completions/mean_terminated_length": 583.925537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.0736, "grad_norm": 0.022606806829571724, "kl": 0.07575225830078125, "learning_rate": 3.638888888888889e-06, "loss": -0.0434, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03241695091128349, "mask/share_reasoning": 0.8497646450996399, "mask/share_step_conf": 0.11391216516494751, "num_tokens": 16536940.0, "reward": 0.5447255373001099, "reward_std": 0.2545296549797058, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.6584327816963196, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.15836204588413239, "step": 69 }, { "adv/mean_abs_final_conf": 0.7562627792358398, "adv/mean_abs_reasoning": 0.5041773915290833, "adv/mean_abs_step_conf": 0.56196129322052, "adv/ratio_final_to_reasoning": 1.499993438702646, "adv/ratio_step_to_reasoning": 1.1146102595282747, "adv/std_final_conf": 0.917499840259552, "adv/std_reasoning": 0.7576442956924438, "adv/std_step_conf": 0.7939934134483337, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.760593220338983, "calib/avg_num_step_conf": 5.9921875, "calib/ece": 0.2526694915254236, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5550847457627118, "calib/gap": 0.3704237288135594, "calib/mean_conf": 0.6841949152542374, "calib/mu_c": 0.869406779661017, "calib/mu_w": 0.4989830508474576, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.21843220338983038, "calib/std_conf": 0.39938059581447927, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3935294117647059, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.0948760159801626, "calib/step_q_w": 0.2986533957845433, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 569.0859375, "completions/mean_terminated_length": 571.3176879882812, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.07466666666666667, "grad_norm": 0.02504328079521656, "kl": 0.07471084594726562, "learning_rate": 3.6111111111111115e-06, "loss": 0.0555, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.03502859175205231, "mask/share_reasoning": 0.8337832689285278, "mask/share_step_conf": 0.12728188931941986, "num_tokens": 16789618.0, "reward": 0.5850033760070801, "reward_std": 0.27402204275131226, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6838277578353882, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.20961657166481018, "step": 70 }, { "adv/mean_abs_final_conf": 0.6755242347717285, "adv/mean_abs_reasoning": 0.5304654240608215, "adv/mean_abs_step_conf": 0.6683783531188965, "adv/ratio_final_to_reasoning": 1.2734557317618405, "adv/ratio_step_to_reasoning": 1.25998476583511, "adv/std_final_conf": 0.8901195526123047, "adv/std_reasoning": 0.7754287123680115, "adv/std_step_conf": 0.8602496981620789, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6632591093117408, "calib/avg_num_step_conf": 6.609375, "calib/ece": 0.27884462151394424, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5099601593625498, "calib/gap": 0.23326383265856954, "calib/mean_conf": 0.6331872509960159, "calib/mu_c": 0.721474358974359, "calib/mu_w": 0.48821052631578943, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1452589641434263, "calib/std_conf": 0.4081854228644826, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3722026431718062, "calib/step_q_c_n": 908.0, "calib/step_q_gap": 0.020047030926908183, "calib/step_q_w": 0.352155612244898, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 519.62109375, "completions/mean_terminated_length": 521.6588745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.07573333333333333, "grad_norm": 2.9167520999908447, "kl": 1.803070068359375, "learning_rate": 3.5833333333333335e-06, "loss": 0.0878, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03633279353380203, "mask/share_reasoning": 0.8216161727905273, "mask/share_step_conf": 0.1381448209285736, "num_tokens": 17027049.0, "reward": 0.654441237449646, "reward_std": 0.2373964935541153, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6939339637756348, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.296979695558548, "step": 71 }, { "adv/mean_abs_final_conf": 0.6847522258758545, "adv/mean_abs_reasoning": 0.4330241084098816, "adv/mean_abs_step_conf": 0.5937687158584595, "adv/ratio_final_to_reasoning": 1.581325872109869, "adv/ratio_step_to_reasoning": 1.3712139909227041, "adv/std_final_conf": 0.8873276114463806, "adv/std_reasoning": 0.7393446564674377, "adv/std_step_conf": 0.8441863059997559, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7949832775919732, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.16163265306122443, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.46938775510204084, "calib/gap": 0.4156722408026756, "calib/mean_conf": 0.620734693877551, "calib/mu_c": 0.8158461538461539, "calib/mu_w": 0.40017391304347827, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12587755102040812, "calib/std_conf": 0.3975797584215511, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39094910591471804, "calib/step_q_c_n": 727.0, "calib/step_q_gap": 0.06935278940721462, "calib/step_q_w": 0.3215963165075034, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 447.27734375, "completions/mean_terminated_length": 449.0314025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0768, "grad_norm": 0.0335516631603241, "kl": 0.1051177978515625, "learning_rate": 3.555555555555556e-06, "loss": -0.086, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03553992509841919, "mask/share_reasoning": 0.8313742280006409, "mask/share_step_conf": 0.12917959690093994, "num_tokens": 17245960.0, "reward": 0.6857629418373108, "reward_std": 0.2375515103340149, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7538734674453735, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.32546502351760864, "step": 72 }, { "adv/mean_abs_final_conf": 0.704717755317688, "adv/mean_abs_reasoning": 0.6175685524940491, "adv/mean_abs_step_conf": 0.6717692613601685, "adv/ratio_final_to_reasoning": 1.1411166460333628, "adv/ratio_step_to_reasoning": 1.0877646840131836, "adv/std_final_conf": 0.8788755536079407, "adv/std_reasoning": 0.8267543315887451, "adv/std_step_conf": 0.8603302836418152, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7280585569884978, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.21044715447154472, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5528455284552846, "calib/gap": 0.3082405019170442, "calib/mean_conf": 0.687520325203252, "calib/mu_c": 0.8065562913907285, "calib/mu_w": 0.49831578947368427, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14207317073170733, "calib/std_conf": 0.38439260184232704, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.42037164750957856, "calib/step_q_c_n": 870.0, "calib/step_q_gap": 0.10647182481454309, "calib/step_q_w": 0.31389982269503547, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 491.4453125, "completions/mean_terminated_length": 491.4453125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.07786666666666667, "grad_norm": 0.03804392367601395, "kl": 0.09320831298828125, "learning_rate": 3.5277777777777784e-06, "loss": 0.0083, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.033659882843494415, "mask/share_reasoning": 0.8447611927986145, "mask/share_step_conf": 0.1215788796544075, "num_tokens": 17478802.0, "reward": 0.6543662548065186, "reward_std": 0.2646758258342743, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7261413931846619, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.27321600914001465, "step": 73 }, { "adv/mean_abs_final_conf": 0.6597466468811035, "adv/mean_abs_reasoning": 0.5524695515632629, "adv/mean_abs_step_conf": 0.6290533542633057, "adv/ratio_final_to_reasoning": 1.1941773895308625, "adv/ratio_step_to_reasoning": 1.1386208569926466, "adv/std_final_conf": 0.8534355759620667, "adv/std_reasoning": 0.8100982904434204, "adv/std_step_conf": 0.8441713452339172, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.7994919950738916, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.17710526315789468, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.8671875, "calib/frac_conf_gt_0.9": 0.42543859649122806, "calib/gap": 0.4420905172413792, "calib/mean_conf": 0.5757017543859648, "calib/mu_c": 0.8006249999999999, "calib/mu_w": 0.3585344827586207, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.13078947368421046, "calib/std_conf": 0.4133622711253466, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.31646752283105023, "calib/step_q_c_n": 584.0, "calib/step_q_gap": 0.09388214031738903, "calib/step_q_w": 0.2225853825136612, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 496.8046875, "completions/mean_terminated_length": 500.7165222167969, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.07893333333333333, "grad_norm": 0.04059533402323723, "kl": 0.106109619140625, "learning_rate": 3.5e-06, "loss": 0.0325, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.03531833365559578, "mask/share_reasoning": 0.8305954337120056, "mask/share_step_conf": 0.1262737512588501, "num_tokens": 17709912.0, "reward": 0.5903666615486145, "reward_std": 0.24747243523597717, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6874351501464844, "rewards/format_reward_step": 0.8671875, "rewards/step_margin_reward": 0.23236066102981567, "step": 74 }, { "adv/mean_abs_final_conf": 0.6986120343208313, "adv/mean_abs_reasoning": 0.5510978698730469, "adv/mean_abs_step_conf": 0.6533565521240234, "adv/ratio_final_to_reasoning": 1.2676732618867976, "adv/ratio_step_to_reasoning": 1.1855544864917247, "adv/std_final_conf": 0.889663577079773, "adv/std_reasoning": 0.7930936813354492, "adv/std_step_conf": 0.8602217435836792, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7936439346323068, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.1379735772357723, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6747967479674797, "calib/gap": 0.44877192017598977, "calib/mean_conf": 0.7591808943089431, "calib/mu_c": 0.8941773255813953, "calib/mu_w": 0.4454054054054055, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09898373983739833, "calib/std_conf": 0.37288879529876306, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.33779599723947545, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.04144474275918869, "calib/step_q_w": 0.29635125448028676, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 435.06640625, "completions/mean_terminated_length": 435.06640625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.08, "grad_norm": 0.03380579501390457, "kl": 0.129364013671875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0647, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.037903718650341034, "mask/share_reasoning": 0.8313228487968445, "mask/share_step_conf": 0.1307734102010727, "num_tokens": 17926041.0, "reward": 0.6791931390762329, "reward_std": 0.2582167983055115, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7923245429992676, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.24106162786483765, "step": 75 }, { "adv/mean_abs_final_conf": 0.670658528804779, "adv/mean_abs_reasoning": 0.5290241241455078, "adv/mean_abs_step_conf": 0.613515317440033, "adv/ratio_final_to_reasoning": 1.2677276861202549, "adv/ratio_step_to_reasoning": 1.1597114185123358, "adv/std_final_conf": 0.8717346787452698, "adv/std_reasoning": 0.7579178810119629, "adv/std_step_conf": 0.8278359174728394, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7431506849315068, "calib/avg_num_step_conf": 5.48046875, "calib/ece": 0.21907076271186438, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.614406779661017, "calib/gap": 0.35976729071537283, "calib/mean_conf": 0.7021156779661016, "calib/mu_c": 0.8393150684931506, "calib/mu_w": 0.4795477777777778, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.15127118644067794, "calib/std_conf": 0.40472681795188176, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.33938511326860843, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.07004982490106199, "calib/step_q_w": 0.26933528836754644, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 492.046875, "completions/mean_terminated_length": 495.9212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.08106666666666666, "grad_norm": 0.03144654259085655, "kl": 0.125335693359375, "learning_rate": 3.444444444444445e-06, "loss": -0.0009, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.03579811006784439, "mask/share_reasoning": 0.8374246954917908, "mask/share_step_conf": 0.11896469444036484, "num_tokens": 18155061.0, "reward": 0.6285925507545471, "reward_std": 0.2776246964931488, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6818852424621582, "rewards/format_reward_step": 0.8984375, "rewards/step_margin_reward": 0.28076860308647156, "step": 76 }, { "adv/mean_abs_final_conf": 0.6806288361549377, "adv/mean_abs_reasoning": 0.5629649758338928, "adv/mean_abs_step_conf": 0.6627333760261536, "adv/ratio_final_to_reasoning": 1.2090074256338152, "adv/ratio_step_to_reasoning": 1.1772195509045276, "adv/std_final_conf": 0.85959792137146, "adv/std_reasoning": 0.7756574153900146, "adv/std_step_conf": 0.8602848649024963, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6311702127659575, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.27256147540983594, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6270491803278688, "calib/gap": 0.22283475177304968, "calib/mean_conf": 0.7275204918032786, "calib/mu_c": 0.8133666666666667, "calib/mu_w": 0.590531914893617, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.1926639344262294, "calib/std_conf": 0.3840531517258748, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.33040831407787763, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.044286915645086744, "calib/step_q_w": 0.2861213984327909, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 476.42578125, "completions/mean_terminated_length": 480.1771545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.08213333333333334, "grad_norm": 0.02921757660806179, "kl": 0.13333892822265625, "learning_rate": 3.416666666666667e-06, "loss": -0.0985, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.040506187826395035, "mask/share_reasoning": 0.8274651765823364, "mask/share_step_conf": 0.12421616911888123, "num_tokens": 18381690.0, "reward": 0.603038489818573, "reward_std": 0.301956444978714, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6591010689735413, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.24463218450546265, "step": 77 }, { "adv/mean_abs_final_conf": 0.7820947170257568, "adv/mean_abs_reasoning": 0.7057161331176758, "adv/mean_abs_step_conf": 0.6220079660415649, "adv/ratio_final_to_reasoning": 1.1082284793046457, "adv/ratio_step_to_reasoning": 0.8813854988601305, "adv/std_final_conf": 0.9215282797813416, "adv/std_reasoning": 0.9057537913322449, "adv/std_step_conf": 0.844108521938324, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.703433628318584, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.26, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.5630252100840336, "calib/gap": 0.3393500884955752, "calib/mean_conf": 0.6599999999999999, "calib/mu_c": 0.82112, "calib/mu_w": 0.4817699115044248, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.19739495798319331, "calib/std_conf": 0.4234214381508266, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.3207333754208754, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.02973395312162641, "calib/step_q_w": 0.29099942229924897, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 526.05078125, "completions/mean_terminated_length": 526.05078125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0832, "grad_norm": 0.0227784663438797, "kl": 0.1304931640625, "learning_rate": 3.3888888888888893e-06, "loss": -0.0972, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.031548913568258286, "mask/share_reasoning": 0.8640303611755371, "mask/share_step_conf": 0.1044207513332367, "num_tokens": 18624383.0, "reward": 0.570490837097168, "reward_std": 0.32064151763916016, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6435176134109497, "rewards/format_reward_step": 0.88671875, "rewards/step_margin_reward": 0.22246401011943817, "step": 78 }, { "adv/mean_abs_final_conf": 0.7220035791397095, "adv/mean_abs_reasoning": 0.59087735414505, "adv/mean_abs_step_conf": 0.5776523351669312, "adv/ratio_final_to_reasoning": 1.221917838067069, "adv/ratio_step_to_reasoning": 0.9776179965514935, "adv/std_final_conf": 0.8911521434783936, "adv/std_reasoning": 0.826758623123169, "adv/std_step_conf": 0.8270339369773865, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7353469974829199, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.24522408963585446, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.6386554621848739, "calib/gap": 0.4036320268488553, "calib/mean_conf": 0.7095658263305322, "calib/mu_c": 0.8842469135802469, "calib/mu_w": 0.4806148867313916, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.19378151260504212, "calib/std_conf": 0.40900581805842684, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.2904243922538113, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.005916032959937978, "calib/step_q_w": 0.2845083592938733, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 515.78125, "completions/mean_terminated_length": 515.78125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.08426666666666667, "grad_norm": 0.031095637008547783, "kl": 0.1221160888671875, "learning_rate": 3.3611111111111117e-06, "loss": -0.1119, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.031694360077381134, "mask/share_reasoning": 0.8600313663482666, "mask/share_step_conf": 0.10827426612377167, "num_tokens": 18862799.0, "reward": 0.5658245086669922, "reward_std": 0.2791858911514282, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6776552200317383, "rewards/format_reward_step": 0.89453125, "rewards/step_margin_reward": 0.16961871087551117, "step": 79 }, { "adv/mean_abs_final_conf": 0.6892819404602051, "adv/mean_abs_reasoning": 0.6510529518127441, "adv/mean_abs_step_conf": 0.6787902116775513, "adv/ratio_final_to_reasoning": 1.0587187087333203, "adv/ratio_step_to_reasoning": 1.0426036926606008, "adv/std_final_conf": 0.8760189414024353, "adv/std_reasoning": 0.8431887626647949, "adv/std_step_conf": 0.8915259838104248, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6643103448275861, "calib/avg_num_step_conf": 6.1015625, "calib/ece": 0.30814965986394555, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.7591836734693878, "calib/gap": 0.18277356321839078, "calib/mean_conf": 0.8436054421768707, "calib/mu_c": 0.918206896551724, "calib/mu_w": 0.7354333333333333, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2799591836734694, "calib/std_conf": 0.31471157059885074, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.3242539448848622, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.04618175553574977, "calib/step_q_w": 0.27807218934911243, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 459.8203125, "completions/mean_terminated_length": 459.8203125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.08533333333333333, "grad_norm": 0.02338639460504055, "kl": 0.14605712890625, "learning_rate": 3.3333333333333333e-06, "loss": -0.041, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03564756363630295, "mask/share_reasoning": 0.8302156329154968, "mask/share_step_conf": 0.13413682579994202, "num_tokens": 19082673.0, "reward": 0.5761132836341858, "reward_std": 0.31394076347351074, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6432272791862488, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.20821793377399445, "step": 80 }, { "adv/mean_abs_final_conf": 0.7191013693809509, "adv/mean_abs_reasoning": 0.6460109949111938, "adv/mean_abs_step_conf": 0.6267831921577454, "adv/ratio_final_to_reasoning": 1.1131410688757777, "adv/ratio_step_to_reasoning": 0.9702361060339357, "adv/std_final_conf": 0.891799807548523, "adv/std_reasoning": 0.8592842817306519, "adv/std_step_conf": 0.8599952459335327, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.6549204162840526, "calib/avg_num_step_conf": 5.87109375, "calib/ece": 0.30724890829694324, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.6943231441048034, "calib/gap": 0.2664118457300275, "calib/mean_conf": 0.7549344978165938, "calib/mu_c": 0.8805785123966942, "calib/mu_w": 0.6141666666666666, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.2668995633187773, "calib/std_conf": 0.39425749137916277, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.3823559774964838, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.08980667281886057, "calib/step_q_w": 0.29254930467762325, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 517.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0864, "grad_norm": 0.02491418458521366, "kl": 0.139801025390625, "learning_rate": 3.3055555555555558e-06, "loss": -0.1251, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.032717056572437286, "mask/share_reasoning": 0.8427871465682983, "mask/share_step_conf": 0.12449577450752258, "num_tokens": 19321273.0, "reward": 0.4967041015625, "reward_std": 0.3085654079914093, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5815199613571167, "rewards/format_reward_step": 0.859375, "rewards/step_margin_reward": 0.14470072090625763, "step": 81 }, { "adv/mean_abs_final_conf": 0.7451927661895752, "adv/mean_abs_reasoning": 0.6260879039764404, "adv/mean_abs_step_conf": 0.6513978242874146, "adv/ratio_final_to_reasoning": 1.1902366448172375, "adv/ratio_step_to_reasoning": 1.0404255059876168, "adv/std_final_conf": 0.9069758653640747, "adv/std_reasoning": 0.8591532111167908, "adv/std_step_conf": 0.8760126829147339, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6043019763949997, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.3746388888888889, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.7833333333333333, "calib/gap": 0.15537956561212374, "calib/mean_conf": 0.8392222222222223, "calib/mu_c": 0.9110852713178295, "calib/mu_w": 0.7557057057057057, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.3381805555555556, "calib/std_conf": 0.33332895367493165, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.36321956204379563, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.07578211224459885, "calib/step_q_w": 0.2874374497991968, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 453.83984375, "completions/mean_terminated_length": 453.83984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.08746666666666666, "grad_norm": 0.02251720428466797, "kl": 0.1554718017578125, "learning_rate": 3.277777777777778e-06, "loss": -0.0063, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03680259734392166, "mask/share_reasoning": 0.838762640953064, "mask/share_step_conf": 0.12443475425243378, "num_tokens": 19543008.0, "reward": 0.519533097743988, "reward_std": 0.30813273787498474, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.573143720626831, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.18154752254486084, "step": 82 }, { "adv/mean_abs_final_conf": 0.6357418298721313, "adv/mean_abs_reasoning": 0.5618091225624084, "adv/mean_abs_step_conf": 0.619588315486908, "adv/ratio_final_to_reasoning": 1.1315975557187754, "adv/ratio_step_to_reasoning": 1.10284488201432, "adv/std_final_conf": 0.8436514735221863, "adv/std_reasoning": 0.826684832572937, "adv/std_step_conf": 0.8600687384605408, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.6373084112149533, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.3270833333333333, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.24517856697819318, "calib/mean_conf": 0.7948419540229884, "calib/mu_c": 0.9079200000000001, "calib/mu_w": 0.6627414330218069, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.29156609195402294, "calib/std_conf": 0.3745497602644934, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.35335213068181814, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.06772584658688369, "calib/step_q_w": 0.28562628409493446, "calib/step_q_w_n": 941.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 534.8984375, "completions/mean_terminated_length": 539.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.08853333333333334, "grad_norm": 0.022823499515652657, "kl": 0.13360595703125, "learning_rate": 3.2500000000000002e-06, "loss": -0.1309, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.030919015407562256, "mask/share_reasoning": 0.8423185348510742, "mask/share_step_conf": 0.11894996464252472, "num_tokens": 19787206.0, "reward": 0.5479799509048462, "reward_std": 0.2878418266773224, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5988320112228394, "rewards/format_reward_step": 0.890625, "rewards/step_margin_reward": 0.22056543827056885, "step": 83 }, { "adv/mean_abs_final_conf": 0.7809598445892334, "adv/mean_abs_reasoning": 0.6179355382919312, "adv/mean_abs_step_conf": 0.6307308673858643, "adv/ratio_final_to_reasoning": 1.2638208942439635, "adv/ratio_step_to_reasoning": 1.0207065758498068, "adv/std_final_conf": 0.9226217269897461, "adv/std_reasoning": 0.8749102354049683, "adv/std_step_conf": 0.8442380428314209, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.708725110619469, "calib/avg_num_step_conf": 5.46875, "calib/ece": 0.35283748271092674, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.7261410788381742, "calib/gap": 0.31572523737094393, "calib/mean_conf": 0.7873561549100968, "calib/mu_c": 0.9550442477876105, "calib/mu_w": 0.6393190104166666, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.33565698478561556, "calib/std_conf": 0.37556390931118633, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.3952507418397626, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.020861852950873727, "calib/step_q_w": 0.3743888888888889, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 443.9921875, "completions/mean_terminated_length": 443.9921875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0896, "grad_norm": 0.03345242515206337, "kl": 0.149383544921875, "learning_rate": 3.2222222222222227e-06, "loss": 0.0009, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03752202168107033, "mask/share_reasoning": 0.8343214988708496, "mask/share_step_conf": 0.12815645337104797, "num_tokens": 20006788.0, "reward": 0.5079925060272217, "reward_std": 0.32121214270591736, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.610805869102478, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.1325230449438095, "step": 84 }, { "adv/mean_abs_final_conf": 0.7412175536155701, "adv/mean_abs_reasoning": 0.6007823944091797, "adv/mean_abs_step_conf": 0.5902255773544312, "adv/ratio_final_to_reasoning": 1.2337537859186052, "adv/ratio_step_to_reasoning": 0.9824282183482919, "adv/std_final_conf": 0.9051445722579956, "adv/std_reasoning": 0.8431324362754822, "adv/std_step_conf": 0.8278036117553711, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.7159264346764347, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.269, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.7217391304347827, "calib/gap": 0.3659218559218559, "calib/mean_conf": 0.7769999999999999, "calib/mu_c": 0.9424603174603174, "calib/mu_w": 0.5765384615384616, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.24908695652173915, "calib/std_conf": 0.3845541178627755, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.3781427399903522, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.07106428524421537, "calib/step_q_w": 0.3070784547461368, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 486.07421875, "completions/mean_terminated_length": 491.83795166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.09066666666666667, "grad_norm": 0.021060096099972725, "kl": 0.1321258544921875, "learning_rate": 3.1944444444444443e-06, "loss": -0.1024, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.03601938486099243, "mask/share_reasoning": 0.827203631401062, "mask/share_step_conf": 0.12505821883678436, "num_tokens": 20239047.0, "reward": 0.573265790939331, "reward_std": 0.3232027590274811, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6436421871185303, "rewards/format_reward_step": 0.87890625, "rewards/step_margin_reward": 0.22867068648338318, "step": 85 }, { "adv/mean_abs_final_conf": 0.7023019790649414, "adv/mean_abs_reasoning": 0.6561944484710693, "adv/mean_abs_step_conf": 0.5927402973175049, "adv/ratio_final_to_reasoning": 1.0702650421705067, "adv/ratio_step_to_reasoning": 0.9032997744778055, "adv/std_final_conf": 0.8754962086677551, "adv/std_reasoning": 0.8748857975006104, "adv/std_step_conf": 0.8273710608482361, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6892927446569179, "calib/avg_num_step_conf": 6.17578125, "calib/ece": 0.2995397489539749, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5774058577405857, "calib/gap": 0.29501616985376844, "calib/mean_conf": 0.6590376569037657, "calib/mu_c": 0.8158035714285715, "calib/mu_w": 0.5207874015748031, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.24497907949790795, "calib/std_conf": 0.4249296208578263, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.38488954344624454, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.10380831654306644, "calib/step_q_w": 0.2810812269031781, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 487.515625, "completions/mean_terminated_length": 489.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.09173333333333333, "grad_norm": 0.034253645688295364, "kl": 0.13690185546875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0628, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.036123938858509064, "mask/share_reasoning": 0.8319493532180786, "mask/share_step_conf": 0.12802043557167053, "num_tokens": 20469363.0, "reward": 0.5517759919166565, "reward_std": 0.28997209668159485, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6238065958023071, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.20943285524845123, "step": 86 }, { "adv/mean_abs_final_conf": 0.6986506581306458, "adv/mean_abs_reasoning": 0.6235262155532837, "adv/mean_abs_step_conf": 0.6891388893127441, "adv/ratio_final_to_reasoning": 1.120483214183225, "adv/ratio_step_to_reasoning": 1.1052284124112397, "adv/std_final_conf": 0.8822417259216309, "adv/std_reasoning": 0.8432462811470032, "adv/std_step_conf": 0.8760759234428406, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6132382382382383, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.2767372881355933, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.7161016949152542, "calib/gap": 0.15632465799132456, "calib/mean_conf": 0.7955508474576272, "calib/mu_c": 0.8445679012345678, "calib/mu_w": 0.6882432432432433, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19292372881355938, "calib/std_conf": 0.35817704876498396, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3530600161225312, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.011983367281176494, "calib/step_q_w": 0.3410766488413547, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 410.04296875, "completions/mean_terminated_length": 416.5516052246094, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.0928, "grad_norm": 0.024705827236175537, "kl": 0.155242919921875, "learning_rate": 3.138888888888889e-06, "loss": -0.0794, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.04058319330215454, "mask/share_reasoning": 0.8201268911361694, "mask/share_step_conf": 0.12366492301225662, "num_tokens": 20679830.0, "reward": 0.601205587387085, "reward_std": 0.2902645468711853, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6523386836051941, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.24069753289222717, "step": 87 }, { "adv/mean_abs_final_conf": 0.730735719203949, "adv/mean_abs_reasoning": 0.6282634735107422, "adv/mean_abs_step_conf": 0.671043872833252, "adv/ratio_final_to_reasoning": 1.163103936507069, "adv/ratio_step_to_reasoning": 1.0680930869392302, "adv/std_final_conf": 0.9192438125610352, "adv/std_reasoning": 0.8431025743484497, "adv/std_step_conf": 0.8757964372634888, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.825514291533709, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.16649051490514905, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5894308943089431, "calib/gap": 0.529289383755403, "calib/mean_conf": 0.6685501355013549, "calib/mu_c": 0.89016317016317, "calib/mu_w": 0.360873786407767, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.126869918699187, "calib/std_conf": 0.42634782608254423, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.3533555040556199, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.04100983342743647, "calib/step_q_w": 0.3123456706281834, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 482.9453125, "completions/mean_terminated_length": 484.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.09386666666666667, "grad_norm": 0.027700036764144897, "kl": 0.1410064697265625, "learning_rate": 3.1111111111111116e-06, "loss": -0.0367, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0347808338701725, "mask/share_reasoning": 0.836738109588623, "mask/share_step_conf": 0.12457481026649475, "num_tokens": 20913312.0, "reward": 0.650510311126709, "reward_std": 0.28270044922828674, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7732701301574707, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.22696925699710846, "step": 88 }, { "adv/mean_abs_final_conf": 0.7559940814971924, "adv/mean_abs_reasoning": 0.5650864839553833, "adv/mean_abs_step_conf": 0.6165207624435425, "adv/ratio_final_to_reasoning": 1.3378378406886162, "adv/ratio_step_to_reasoning": 1.0910201888534645, "adv/std_final_conf": 0.9213818907737732, "adv/std_reasoning": 0.8430148959159851, "adv/std_step_conf": 0.844292402267456, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.7593352039715087, "calib/avg_num_step_conf": 5.37109375, "calib/ece": 0.22322033898305088, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.4406779661016949, "calib/gap": 0.4054622634721923, "calib/mean_conf": 0.561864406779661, "calib/mu_c": 0.7731858407079646, "calib/mu_w": 0.3677235772357723, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.153135593220339, "calib/std_conf": 0.43379962794785676, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.41738006535947714, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.1021907687233915, "calib/step_q_w": 0.31518929663608564, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 484.66015625, "completions/mean_terminated_length": 486.5608215332031, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 0.09493333333333333, "grad_norm": 0.02516189031302929, "kl": 0.1395263671875, "learning_rate": 3.0833333333333336e-06, "loss": -0.119, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.03532232344150543, "mask/share_reasoning": 0.8396503925323486, "mask/share_step_conf": 0.12112107872962952, "num_tokens": 21146273.0, "reward": 0.6061965227127075, "reward_std": 0.28105542063713074, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6944777965545654, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.24682149291038513, "step": 89 }, { "adv/mean_abs_final_conf": 0.7514957189559937, "adv/mean_abs_reasoning": 0.5751931667327881, "adv/mean_abs_step_conf": 0.6520944833755493, "adv/ratio_final_to_reasoning": 1.3065101646193733, "adv/ratio_step_to_reasoning": 1.1336965059574264, "adv/std_final_conf": 0.9219711422920227, "adv/std_reasoning": 0.8267685770988464, "adv/std_step_conf": 0.860261857509613, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7084565345949143, "calib/avg_num_step_conf": 6.703125, "calib/ece": 0.24481327800829875, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5435684647302904, "calib/gap": 0.3146791839148432, "calib/mean_conf": 0.6481327800829875, "calib/mu_c": 0.7643421052631577, "calib/mu_w": 0.44966292134831454, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.13112033195020745, "calib/std_conf": 0.418470142755428, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.3826992038767739, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.08950283380151941, "calib/step_q_w": 0.2931963700752545, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 499.35546875, "completions/mean_terminated_length": 499.35546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.096, "grad_norm": 0.030875807628035545, "kl": 0.1427459716796875, "learning_rate": 3.055555555555556e-06, "loss": -0.0019, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03474542498588562, "mask/share_reasoning": 0.8275556564331055, "mask/share_step_conf": 0.1376989185810089, "num_tokens": 21377428.0, "reward": 0.6205670833587646, "reward_std": 0.28211313486099243, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6910874843597412, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.243796706199646, "step": 90 }, { "adv/mean_abs_final_conf": 0.7292274236679077, "adv/mean_abs_reasoning": 0.6552180051803589, "adv/mean_abs_step_conf": 0.6420432329177856, "adv/ratio_final_to_reasoning": 1.1129538839018573, "adv/ratio_step_to_reasoning": 0.9798925362880608, "adv/std_final_conf": 0.8911682963371277, "adv/std_reasoning": 0.8592181205749512, "adv/std_step_conf": 0.8442062735557556, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6807432432432432, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.2565702479338844, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.512396694214876, "calib/gap": 0.2879298447383554, "calib/mean_conf": 0.6234297520661156, "calib/mu_c": 0.7352702702702704, "calib/mu_w": 0.44734042553191494, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13421487603305793, "calib/std_conf": 0.422093627278587, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.38535048309178743, "calib/step_q_c_n": 828.0, "calib/step_q_gap": 0.06144196011097536, "calib/step_q_w": 0.3239085229808121, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 481.3203125, "completions/mean_terminated_length": 487.0276794433594, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.09706666666666666, "grad_norm": 0.03341059759259224, "kl": 0.1559906005859375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0832, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03211017698049545, "mask/share_reasoning": 0.8307291865348816, "mask/share_step_conf": 0.12544187903404236, "num_tokens": 21608358.0, "reward": 0.5959835052490234, "reward_std": 0.2786335349082947, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6737020015716553, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.21514007449150085, "step": 91 }, { "adv/mean_abs_final_conf": 0.7283531427383423, "adv/mean_abs_reasoning": 0.6388962864875793, "adv/mean_abs_step_conf": 0.6886264085769653, "adv/ratio_final_to_reasoning": 1.1400178059299175, "adv/ratio_step_to_reasoning": 1.077837550696662, "adv/std_final_conf": 0.8974381685256958, "adv/std_reasoning": 0.8747364282608032, "adv/std_step_conf": 0.876107931137085, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.736181238373638, "calib/avg_num_step_conf": 5.8359375, "calib/ece": 0.2368548387096773, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5524193548387096, "calib/gap": 0.3382354504384799, "calib/mean_conf": 0.6595161290322581, "calib/mu_c": 0.8040845070422535, "calib/mu_w": 0.4658490566037736, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16189516129032247, "calib/std_conf": 0.41659304587197155, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3574724285104567, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.043976869838179944, "calib/step_q_w": 0.31349555867227674, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 467.015625, "completions/mean_terminated_length": 467.015625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.09813333333333334, "grad_norm": 0.02376173995435238, "kl": 0.1473236083984375, "learning_rate": 3e-06, "loss": -0.0482, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0371013879776001, "mask/share_reasoning": 0.8304646611213684, "mask/share_step_conf": 0.13243398070335388, "num_tokens": 21834634.0, "reward": 0.6363959908485413, "reward_std": 0.2901349663734436, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7092921733856201, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.26037484407424927, "step": 92 }, { "adv/mean_abs_final_conf": 0.705845832824707, "adv/mean_abs_reasoning": 0.5151506662368774, "adv/mean_abs_step_conf": 0.6637080907821655, "adv/ratio_final_to_reasoning": 1.3701735804416952, "adv/ratio_step_to_reasoning": 1.2883766522724018, "adv/std_final_conf": 0.8816631436347961, "adv/std_reasoning": 0.7756566405296326, "adv/std_step_conf": 0.8602451086044312, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7549019607843138, "calib/avg_num_step_conf": 7.55859375, "calib/ece": 0.19257261410788384, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4190871369294606, "calib/gap": 0.36871288515406175, "calib/mean_conf": 0.5759751037344398, "calib/mu_c": 0.7366176470588236, "calib/mu_w": 0.36790476190476185, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10211618257261414, "calib/std_conf": 0.4081879593625438, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.38613205268935236, "calib/step_q_c_n": 911.0, "calib/step_q_gap": 0.12965457873101904, "calib/step_q_w": 0.2564774739583333, "calib/step_q_w_n": 1024.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 529.91015625, "completions/mean_terminated_length": 529.91015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0992, "grad_norm": 0.04444187134504318, "kl": 0.136962890625, "learning_rate": 2.9722222222222225e-06, "loss": 0.0925, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.033977434039115906, "mask/share_reasoning": 0.815356969833374, "mask/share_step_conf": 0.15066558122634888, "num_tokens": 22076067.0, "reward": 0.6086573600769043, "reward_std": 0.2560272812843323, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7197816371917725, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.20456430315971375, "step": 93 }, { "adv/mean_abs_final_conf": 0.732485294342041, "adv/mean_abs_reasoning": 0.5946255922317505, "adv/mean_abs_step_conf": 0.6553810834884644, "adv/ratio_final_to_reasoning": 1.2318428670264177, "adv/ratio_step_to_reasoning": 1.1021743632471084, "adv/std_final_conf": 0.9107852578163147, "adv/std_reasoning": 0.8267837762832642, "adv/std_step_conf": 0.8603094220161438, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.72498634998635, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.2313925925925926, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5185185185185185, "calib/gap": 0.36048714168714163, "calib/mean_conf": 0.6247390946502057, "calib/mu_c": 0.7894060606060606, "calib/mu_w": 0.4289189189189189, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15646090534979423, "calib/std_conf": 0.42964667385699845, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.41655879494655, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.09046864847517849, "calib/step_q_w": 0.3260901464713715, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 444.9609375, "completions/mean_terminated_length": 446.7059020996094, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.10026666666666667, "grad_norm": 0.023827511817216873, "kl": 0.1639862060546875, "learning_rate": 2.944444444444445e-06, "loss": -0.0212, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.037360891699790955, "mask/share_reasoning": 0.8292186260223389, "mask/share_step_conf": 0.12951421737670898, "num_tokens": 22298657.0, "reward": 0.585480809211731, "reward_std": 0.2746550440788269, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7019674777984619, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.17524418234825134, "step": 94 }, { "adv/mean_abs_final_conf": 0.6834286451339722, "adv/mean_abs_reasoning": 0.5906679630279541, "adv/mean_abs_step_conf": 0.6076480150222778, "adv/ratio_final_to_reasoning": 1.1570436995270523, "adv/ratio_step_to_reasoning": 1.0287472032633673, "adv/std_final_conf": 0.8915984630584717, "adv/std_reasoning": 0.859029233455658, "adv/std_step_conf": 0.8425108194351196, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7615390011223344, "calib/avg_num_step_conf": 6.33984375, "calib/ece": 0.220164609053498, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5925925925925926, "calib/gap": 0.4189078282828283, "calib/mean_conf": 0.6632921810699588, "calib/mu_c": 0.8339583333333334, "calib/mu_w": 0.41505050505050506, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14543209876543214, "calib/std_conf": 0.430606645738895, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.37464257206208423, "calib/step_q_c_n": 902.0, "calib/step_q_gap": 0.05460789799828397, "calib/step_q_w": 0.32003467406380026, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 493.8046875, "completions/mean_terminated_length": 493.8046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.10133333333333333, "grad_norm": 0.024645326659083366, "kl": 0.1318206787109375, "learning_rate": 2.916666666666667e-06, "loss": 0.0285, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03597475588321686, "mask/share_reasoning": 0.826029896736145, "mask/share_step_conf": 0.13799530267715454, "num_tokens": 22531199.0, "reward": 0.6219439506530762, "reward_std": 0.2513583302497864, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7273960709571838, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.21414805948734283, "step": 95 }, { "adv/mean_abs_final_conf": 0.6429899334907532, "adv/mean_abs_reasoning": 0.5211592316627502, "adv/mean_abs_step_conf": 0.6416047215461731, "adv/ratio_final_to_reasoning": 1.2337686726555799, "adv/ratio_step_to_reasoning": 1.2311107288633139, "adv/std_final_conf": 0.8641905784606934, "adv/std_reasoning": 0.7929425835609436, "adv/std_step_conf": 0.8759558796882629, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8074252136752137, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.15203252032520323, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.48409340659340655, "calib/mean_conf": 0.734959349593496, "calib/mu_c": 0.8884523809523809, "calib/mu_w": 0.40435897435897433, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10203252032520324, "calib/std_conf": 0.39943532090103995, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.42555579171094576, "calib/step_q_c_n": 941.0, "calib/step_q_gap": 0.10935501522730745, "calib/step_q_w": 0.3162007764836383, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 449.78515625, "completions/mean_terminated_length": 451.5490417480469, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1024, "grad_norm": 0.025421667844057083, "kl": 0.169464111328125, "learning_rate": 2.888888888888889e-06, "loss": -0.0997, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036100100725889206, "mask/share_reasoning": 0.8264784216880798, "mask/share_step_conf": 0.13351520895957947, "num_tokens": 22752160.0, "reward": 0.6261996030807495, "reward_std": 0.2512551546096802, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7869394421577454, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.144366055727005, "step": 96 }, { "adv/mean_abs_final_conf": 0.7186359763145447, "adv/mean_abs_reasoning": 0.5297770500183105, "adv/mean_abs_step_conf": 0.6950712203979492, "adv/ratio_final_to_reasoning": 1.3564875569632673, "adv/ratio_step_to_reasoning": 1.312007042158443, "adv/std_final_conf": 0.8968744277954102, "adv/std_reasoning": 0.7928928136825562, "adv/std_step_conf": 0.8915084600448608, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7212833462833463, "calib/avg_num_step_conf": 6.421875, "calib/ece": 0.24276947791164655, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.570281124497992, "calib/gap": 0.3601947163947164, "calib/mean_conf": 0.6778730923694778, "calib/mu_c": 0.8471212121212122, "calib/mu_w": 0.48692649572649577, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1952610441767068, "calib/std_conf": 0.41529027806958346, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43279647435897434, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.15761153986115772, "calib/step_q_w": 0.2751849344978166, "calib/step_q_w_n": 916.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 440.671875, "completions/mean_terminated_length": 444.1417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.10346666666666667, "grad_norm": 0.0416029617190361, "kl": 0.1555328369140625, "learning_rate": 2.861111111111111e-06, "loss": -0.0557, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.037551771849393845, "mask/share_reasoning": 0.8153103590011597, "mask/share_step_conf": 0.13932538032531738, "num_tokens": 22970044.0, "reward": 0.6227738261222839, "reward_std": 0.28734835982322693, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7159277200698853, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.23196370899677277, "step": 97 }, { "adv/mean_abs_final_conf": 0.7329667210578918, "adv/mean_abs_reasoning": 0.6135759949684143, "adv/mean_abs_step_conf": 0.6760110259056091, "adv/ratio_final_to_reasoning": 1.194581807418368, "adv/ratio_step_to_reasoning": 1.1017559869505795, "adv/std_final_conf": 0.897504448890686, "adv/std_reasoning": 0.8430263996124268, "adv/std_step_conf": 0.9067613482475281, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6668100310990539, "calib/avg_num_step_conf": 6.07421875, "calib/ece": 0.3134146341463416, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.28540858863230323, "calib/mean_conf": 0.7239837398373985, "calib/mu_c": 0.8620472440944882, "calib/mu_w": 0.576638655462185, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.260569105691057, "calib/std_conf": 0.4137283186909335, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.3791700942587832, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.07158051896920792, "calib/step_q_w": 0.30758957528957526, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 492.71875, "completions/mean_terminated_length": 492.71875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10453333333333334, "grad_norm": 0.018140403553843498, "kl": 0.1438140869140625, "learning_rate": 2.8333333333333335e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036345966160297394, "mask/share_reasoning": 0.8325316905975342, "mask/share_step_conf": 0.13112229108810425, "num_tokens": 23202364.0, "reward": 0.5608394145965576, "reward_std": 0.30682680010795593, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6480875015258789, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.18296632170677185, "step": 98 }, { "adv/mean_abs_final_conf": 0.7341911792755127, "adv/mean_abs_reasoning": 0.574193000793457, "adv/mean_abs_step_conf": 0.5314733982086182, "adv/ratio_final_to_reasoning": 1.2786487788269099, "adv/ratio_step_to_reasoning": 0.9256006211747511, "adv/std_final_conf": 0.8968331813812256, "adv/std_reasoning": 0.8101562857627869, "adv/std_step_conf": 0.776594877243042, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.7210161314638928, "calib/avg_num_step_conf": 7.140625, "calib/ece": 0.2821888412017167, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.4291845493562232, "calib/gap": 0.3625629428614503, "calib/mean_conf": 0.5398712446351932, "calib/mu_c": 0.7483838383838384, "calib/mu_w": 0.3858208955223881, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.19858369098712447, "calib/std_conf": 0.4464773717758751, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.36605466867469877, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.08826291609737918, "calib/step_q_w": 0.2777917525773196, "calib/step_q_w_n": 1164.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2917.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 596.578125, "completions/mean_terminated_length": 606.0476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1056, "grad_norm": 0.026684636250138283, "kl": 0.1192474365234375, "learning_rate": 2.805555555555556e-06, "loss": -0.1199, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.030056292191147804, "mask/share_reasoning": 0.8327727913856506, "mask/share_step_conf": 0.12154591828584671, "num_tokens": 23460888.0, "reward": 0.5144092440605164, "reward_std": 0.254136323928833, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6555582284927368, "rewards/format_reward_step": 0.90234375, "rewards/step_margin_reward": 0.11466653645038605, "step": 99 }, { "adv/mean_abs_final_conf": 0.7626749277114868, "adv/mean_abs_reasoning": 0.6028633713722229, "adv/mean_abs_step_conf": 0.6579493284225464, "adv/ratio_final_to_reasoning": 1.2650875205363774, "adv/ratio_step_to_reasoning": 1.091373866229322, "adv/std_final_conf": 0.9262394905090332, "adv/std_reasoning": 0.8429723381996155, "adv/std_step_conf": 0.8603565096855164, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7521527777777777, "calib/avg_num_step_conf": 6.51171875, "calib/ece": 0.26073611111111106, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5708333333333333, "calib/gap": 0.3866944444444445, "calib/mean_conf": 0.6394305555555556, "calib/mu_c": 0.8327777777777778, "calib/mu_w": 0.44608333333333333, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20008333333333328, "calib/std_conf": 0.43805092274552954, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3679578559027778, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.04721258337774997, "calib/step_q_w": 0.3207452725250278, "calib/step_q_w_n": 899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 532.53515625, "completions/mean_terminated_length": 538.849853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 63.0, "epoch": 0.10666666666666667, "grad_norm": 0.028963034972548485, "kl": 0.1346282958984375, "learning_rate": 2.7777777777777783e-06, "loss": 0.0506, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.034075599163770676, "mask/share_reasoning": 0.8251175880432129, "mask/share_step_conf": 0.12908801436424255, "num_tokens": 23704625.0, "reward": 0.585178017616272, "reward_std": 0.29357630014419556, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6823604106903076, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.20752683281898499, "step": 100 }, { "adv/mean_abs_final_conf": 0.7027376294136047, "adv/mean_abs_reasoning": 0.5980355739593506, "adv/mean_abs_step_conf": 0.6533796787261963, "adv/ratio_final_to_reasoning": 1.1750766342561603, "adv/ratio_step_to_reasoning": 1.092543164949929, "adv/std_final_conf": 0.8762261271476746, "adv/std_reasoning": 0.8268650770187378, "adv/std_step_conf": 0.8731431365013123, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7106889204545455, "calib/avg_num_step_conf": 6.79296875, "calib/ece": 0.28050420168067225, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.4495798319327731, "calib/gap": 0.35029545454545463, "calib/mean_conf": 0.5081512605042017, "calib/mu_c": 0.6965454545454546, "calib/mu_w": 0.34624999999999995, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1632352941176471, "calib/std_conf": 0.46424143672666857, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.3425476433121019, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.010807601383380716, "calib/step_q_w": 0.3317400419287212, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 586.52734375, "completions/mean_terminated_length": 588.8275146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.10773333333333333, "grad_norm": 0.02456309087574482, "kl": 0.1267242431640625, "learning_rate": 2.7500000000000004e-06, "loss": 0.049, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.030890878289937973, "mask/share_reasoning": 0.8370897769927979, "mask/share_step_conf": 0.12811307609081268, "num_tokens": 23961768.0, "reward": 0.582810640335083, "reward_std": 0.2614016830921173, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6503531336784363, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.24495576322078705, "step": 101 }, { "adv/mean_abs_final_conf": 0.5911612510681152, "adv/mean_abs_reasoning": 0.4206992983818054, "adv/mean_abs_step_conf": 0.5980938076972961, "adv/ratio_final_to_reasoning": 1.4051871570548882, "adv/ratio_step_to_reasoning": 1.4216658073779254, "adv/std_final_conf": 0.8265983462333679, "adv/std_reasoning": 0.681780219078064, "adv/std_step_conf": 0.8105750679969788, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.722962962962963, "calib/avg_num_step_conf": 6.70703125, "calib/ece": 0.24702811244979922, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.3666666666666666, "calib/mean_conf": 0.6842168674698795, "calib/mu_c": 0.83, "calib/mu_w": 0.4633333333333334, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16441767068273097, "calib/std_conf": 0.42517311517557044, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3707850055126792, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.0813254581875763, "calib/step_q_w": 0.2894595473251029, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 457.0390625, "completions/mean_terminated_length": 460.6377868652344, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1088, "grad_norm": 0.03056369163095951, "kl": 0.1494903564453125, "learning_rate": 2.7222222222222224e-06, "loss": -0.0918, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03953394293785095, "mask/share_reasoning": 0.8042321801185608, "mask/share_step_conf": 0.14842136204242706, "num_tokens": 24185466.0, "reward": 0.5922073721885681, "reward_std": 0.23481258749961853, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7250789403915405, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.14839836955070496, "step": 102 }, { "adv/mean_abs_final_conf": 0.6045788526535034, "adv/mean_abs_reasoning": 0.4766625165939331, "adv/mean_abs_step_conf": 0.5854978561401367, "adv/ratio_final_to_reasoning": 1.2683582862223288, "adv/ratio_step_to_reasoning": 1.2283278750842497, "adv/std_final_conf": 0.844353973865509, "adv/std_reasoning": 0.7576423287391663, "adv/std_step_conf": 0.8277443647384644, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7934989788972089, "calib/avg_num_step_conf": 7.40234375, "calib/ece": 0.2069362139917695, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5596707818930041, "calib/gap": 0.47913950306330844, "calib/mean_conf": 0.6244218106995885, "calib/mu_c": 0.8472315384615385, "calib/mu_w": 0.3680920353982301, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1481893004115226, "calib/std_conf": 0.44728296405722406, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.37132728187744457, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.09995898702842149, "calib/step_q_w": 0.2713682948490231, "calib/step_q_w_n": 1126.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 578.1875, "completions/mean_terminated_length": 587.3651123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.10986666666666667, "grad_norm": 0.039372753351926804, "kl": 0.1265411376953125, "learning_rate": 2.6944444444444444e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03180355578660965, "mask/share_reasoning": 0.8254137635231018, "mask/share_step_conf": 0.12715765833854675, "num_tokens": 24438034.0, "reward": 0.6127403974533081, "reward_std": 0.24559491872787476, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7271147966384888, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.2108660787343979, "step": 103 }, { "adv/mean_abs_final_conf": 0.7261596918106079, "adv/mean_abs_reasoning": 0.5250190496444702, "adv/mean_abs_step_conf": 0.5984550714492798, "adv/ratio_final_to_reasoning": 1.3831111314957907, "adv/ratio_step_to_reasoning": 1.139873061471843, "adv/std_final_conf": 0.8972943425178528, "adv/std_reasoning": 0.7928174734115601, "adv/std_step_conf": 0.8275766968727112, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7644719251336898, "calib/avg_num_step_conf": 6.36328125, "calib/ece": 0.21361788617886177, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3699186991869919, "calib/gap": 0.4705962566844919, "calib/mean_conf": 0.43410569105691055, "calib/mu_c": 0.6942727272727272, "calib/mu_w": 0.22367647058823528, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10028455284552842, "calib/std_conf": 0.4575560441572679, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.42492398119122254, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.07767541072368134, "calib/step_q_w": 0.3472485704675412, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 525.37109375, "completions/mean_terminated_length": 527.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.11093333333333333, "grad_norm": 0.0321781225502491, "kl": 0.1436614990234375, "learning_rate": 2.666666666666667e-06, "loss": -0.0172, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03203163295984268, "mask/share_reasoning": 0.8348712921142578, "mask/share_step_conf": 0.12919080257415771, "num_tokens": 24679209.0, "reward": 0.6384133100509644, "reward_std": 0.24226495623588562, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7417183518409729, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.25776445865631104, "step": 104 }, { "adv/mean_abs_final_conf": 0.7585700154304504, "adv/mean_abs_reasoning": 0.6300908923149109, "adv/mean_abs_step_conf": 0.6589280366897583, "adv/ratio_final_to_reasoning": 1.2039056978645033, "adv/ratio_step_to_reasoning": 1.0457666421250777, "adv/std_final_conf": 0.9060263633728027, "adv/std_reasoning": 0.8591178059577942, "adv/std_step_conf": 0.8760077953338623, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7042738970588235, "calib/avg_num_step_conf": 6.51953125, "calib/ece": 0.264412955465587, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4939271255060729, "calib/gap": 0.35177849264705885, "calib/mean_conf": 0.567004048582996, "calib/mu_c": 0.7364843750000001, "calib/mu_w": 0.38470588235294123, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1565991902834008, "calib/std_conf": 0.4546974034768048, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3779455334987593, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.0923137606134754, "calib/step_q_w": 0.2856317728852839, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 525.06640625, "completions/mean_terminated_length": 525.06640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.112, "grad_norm": 0.03275076299905777, "kl": 0.1414337158203125, "learning_rate": 2.6388888888888893e-06, "loss": -0.0095, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03425367921590805, "mask/share_reasoning": 0.8308181166648865, "mask/share_step_conf": 0.13492822647094727, "num_tokens": 24919386.0, "reward": 0.5891318917274475, "reward_std": 0.2612612247467041, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6877496242523193, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.19832667708396912, "step": 105 }, { "adv/mean_abs_final_conf": 0.6901768445968628, "adv/mean_abs_reasoning": 0.5217985510826111, "adv/mean_abs_step_conf": 0.5970039963722229, "adv/ratio_final_to_reasoning": 1.3226883117343, "adv/ratio_step_to_reasoning": 1.1441273555351543, "adv/std_final_conf": 0.8845764994621277, "adv/std_reasoning": 0.7754665017127991, "adv/std_step_conf": 0.8276647329330444, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8172105672105672, "calib/avg_num_step_conf": 6.58203125, "calib/ece": 0.20381445783132535, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.43373493975903615, "calib/gap": 0.4886016317016317, "calib/mean_conf": 0.5239767068273092, "calib/mu_c": 0.753560606060606, "calib/mu_w": 0.26495897435897436, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09883534136546188, "calib/std_conf": 0.45357286802235264, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.36024539877300615, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.016352418335721874, "calib/step_q_w": 0.3438929804372843, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 507.08984375, "completions/mean_terminated_length": 507.08984375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.11306666666666666, "grad_norm": 0.03155747428536415, "kl": 0.137786865234375, "learning_rate": 2.6111111111111113e-06, "loss": 0.058, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03507569804787636, "mask/share_reasoning": 0.8297785520553589, "mask/share_step_conf": 0.13514570891857147, "num_tokens": 25153785.0, "reward": 0.653069257736206, "reward_std": 0.22058628499507904, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7580972909927368, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.25272873044013977, "step": 106 }, { "adv/mean_abs_final_conf": 0.7676753997802734, "adv/mean_abs_reasoning": 0.5727545022964478, "adv/mean_abs_step_conf": 0.6776283383369446, "adv/ratio_final_to_reasoning": 1.3403218948123397, "adv/ratio_step_to_reasoning": 1.1831043415983764, "adv/std_final_conf": 0.9092914462089539, "adv/std_reasoning": 0.8099678754806519, "adv/std_step_conf": 0.8760293126106262, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6613672496025437, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.28372469635627534, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4979757085020243, "calib/gap": 0.28191971383147846, "calib/mean_conf": 0.5919838056680162, "calib/mu_c": 0.7186764705882352, "calib/mu_w": 0.4367567567567568, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1625506072874494, "calib/std_conf": 0.43690078889020717, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.35915129280223623, "calib/step_q_c_n": 954.0, "calib/step_q_gap": 0.02088309306504832, "calib/step_q_w": 0.3382681997371879, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 500.25390625, "completions/mean_terminated_length": 500.25390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.11413333333333334, "grad_norm": 0.04856055602431297, "kl": 0.1526031494140625, "learning_rate": 2.5833333333333337e-06, "loss": -0.0344, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03367823362350464, "mask/share_reasoning": 0.822372555732727, "mask/share_step_conf": 0.14394915103912354, "num_tokens": 25386466.0, "reward": 0.6302648782730103, "reward_std": 0.252381831407547, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6748914122581482, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.28641968965530396, "step": 107 }, { "adv/mean_abs_final_conf": 0.654589056968689, "adv/mean_abs_reasoning": 0.5746661424636841, "adv/mean_abs_step_conf": 0.6928590536117554, "adv/ratio_final_to_reasoning": 1.139077124262729, "adv/ratio_step_to_reasoning": 1.2056723067786101, "adv/std_final_conf": 0.875133216381073, "adv/std_reasoning": 0.8266122341156006, "adv/std_step_conf": 0.8914769291877747, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7072101687486303, "calib/avg_num_step_conf": 7.12109375, "calib/ece": 0.2527466666666667, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.556, "calib/gap": 0.36600920447074303, "calib/mean_conf": 0.6285333333333333, "calib/mu_c": 0.7471203155818541, "calib/mu_w": 0.38111111111111107, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10264000000000006, "calib/std_conf": 0.44744954277922033, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3643041935483871, "calib/step_q_c_n": 1240.0, "calib/step_q_gap": 0.042662684114424865, "calib/step_q_w": 0.32164150943396225, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 528.72265625, "completions/mean_terminated_length": 530.7960815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.1152, "grad_norm": 0.03127816319465637, "kl": 0.1278228759765625, "learning_rate": 2.5555555555555557e-06, "loss": -0.038, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03343158960342407, "mask/share_reasoning": 0.8212839961051941, "mask/share_step_conf": 0.14137813448905945, "num_tokens": 25625051.0, "reward": 0.6690140962600708, "reward_std": 0.24003738164901733, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7176185846328735, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.29384708404541016, "step": 108 }, { "adv/mean_abs_final_conf": 0.6834123730659485, "adv/mean_abs_reasoning": 0.41160860657691956, "adv/mean_abs_step_conf": 0.5786731243133545, "adv/ratio_final_to_reasoning": 1.6603451972237502, "adv/ratio_step_to_reasoning": 1.4058819836781393, "adv/std_final_conf": 0.8892835974693298, "adv/std_reasoning": 0.73923259973526, "adv/std_step_conf": 0.8438445925712585, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8023218002081165, "calib/avg_num_step_conf": 7.87890625, "calib/ece": 0.19469919354838705, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3951612903225806, "calib/gap": 0.4922145161290321, "calib/mean_conf": 0.4683653225806451, "calib/mu_c": 0.7144725806451612, "calib/mu_w": 0.22225806451612903, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.08153225806451611, "calib/std_conf": 0.4570179747902128, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3821491745283019, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.09624498291153544, "calib/step_q_w": 0.28590419161676645, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 545.58203125, "completions/mean_terminated_length": 549.8779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.11626666666666667, "grad_norm": 0.03397579863667488, "kl": 0.13104248046875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0067, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031111977994441986, "mask/share_reasoning": 0.8199541568756104, "mask/share_step_conf": 0.14112132787704468, "num_tokens": 25869320.0, "reward": 0.6208849549293518, "reward_std": 0.18744871020317078, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7542101144790649, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.1992785781621933, "step": 109 }, { "adv/mean_abs_final_conf": 0.7636078596115112, "adv/mean_abs_reasoning": 0.5572937726974487, "adv/mean_abs_step_conf": 0.6667872667312622, "adv/ratio_final_to_reasoning": 1.3702070559939112, "adv/ratio_step_to_reasoning": 1.1964735645687123, "adv/std_final_conf": 0.9241266250610352, "adv/std_reasoning": 0.7756907939910889, "adv/std_step_conf": 0.8761153221130371, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6368500525210083, "calib/avg_num_step_conf": 5.97265625, "calib/ece": 0.3292307692307692, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3643724696356275, "calib/gap": 0.22762867647058826, "calib/mean_conf": 0.4749797570850202, "calib/mu_c": 0.5929411764705883, "calib/mu_w": 0.36531250000000004, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16121457489878543, "calib/std_conf": 0.452432807093334, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42202565687789795, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.07993570985484433, "calib/step_q_w": 0.3420899470230536, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 476.87109375, "completions/mean_terminated_length": 480.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.11733333333333333, "grad_norm": 0.042963579297065735, "kl": 0.1475067138671875, "learning_rate": 2.5e-06, "loss": -0.052, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03687353432178497, "mask/share_reasoning": 0.8216755390167236, "mask/share_step_conf": 0.1336384415626526, "num_tokens": 26096319.0, "reward": 0.5779798030853271, "reward_std": 0.24897190928459167, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6321703195571899, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.23863306641578674, "step": 110 }, { "adv/mean_abs_final_conf": 0.6640579700469971, "adv/mean_abs_reasoning": 0.46967190504074097, "adv/mean_abs_step_conf": 0.6402167677879333, "adv/ratio_final_to_reasoning": 1.4138762887880092, "adv/ratio_step_to_reasoning": 1.363114891303534, "adv/std_final_conf": 0.8714942336082458, "adv/std_reasoning": 0.7393086552619934, "adv/std_step_conf": 0.8442738652229309, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7487291904943449, "calib/avg_num_step_conf": 6.05078125, "calib/ece": 0.24609561752988052, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.46613545816733065, "calib/gap": 0.4163686618375905, "calib/mean_conf": 0.5394820717131474, "calib/mu_c": 0.741860465116279, "calib/mu_w": 0.3254918032786885, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13581673306772912, "calib/std_conf": 0.46152973025543237, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42224137931034483, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.1000151190524316, "calib/step_q_w": 0.32222626025791323, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 515.46484375, "completions/mean_terminated_length": 517.486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.1184, "grad_norm": 0.024928772822022438, "kl": 0.13892364501953125, "learning_rate": 2.4722222222222226e-06, "loss": -0.0477, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03475944325327873, "mask/share_reasoning": 0.8337557911872864, "mask/share_step_conf": 0.12757855653762817, "num_tokens": 26335686.0, "reward": 0.6076115369796753, "reward_std": 0.2294948846101761, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7280757427215576, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.19105350971221924, "step": 111 }, { "adv/mean_abs_final_conf": 0.6469699144363403, "adv/mean_abs_reasoning": 0.5888714790344238, "adv/mean_abs_step_conf": 0.54422926902771, "adv/ratio_final_to_reasoning": 1.098660637287411, "adv/ratio_step_to_reasoning": 0.924190232340826, "adv/std_final_conf": 0.8595841526985168, "adv/std_reasoning": 0.826600193977356, "adv/std_step_conf": 0.7939732670783997, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8530377668308702, "calib/avg_num_step_conf": 6.5, "calib/ece": 0.14024793388429757, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.34710743801652894, "calib/gap": 0.5953940886699507, "calib/mean_conf": 0.4239669421487603, "calib/mu_c": 0.7339655172413793, "calib/mu_w": 0.13857142857142854, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.042438016528925655, "calib/std_conf": 0.44830070217247653, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4299348769898698, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.15862922436910926, "calib/step_q_w": 0.27130565262076056, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 537.80859375, "completions/mean_terminated_length": 542.0433349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.11946666666666667, "grad_norm": 0.029871821403503418, "kl": 0.129302978515625, "learning_rate": 2.4444444444444447e-06, "loss": -0.0443, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03093784675002098, "mask/share_reasoning": 0.8343961834907532, "mask/share_step_conf": 0.12685343623161316, "num_tokens": 26581285.0, "reward": 0.6027513742446899, "reward_std": 0.19106586277484894, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7814507484436035, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.1482706367969513, "step": 112 }, { "adv/mean_abs_final_conf": 0.6840471029281616, "adv/mean_abs_reasoning": 0.5645737051963806, "adv/mean_abs_step_conf": 0.6825007200241089, "adv/ratio_final_to_reasoning": 1.211617007012793, "adv/ratio_step_to_reasoning": 1.2088779795132483, "adv/std_final_conf": 0.8864825367927551, "adv/std_reasoning": 0.8097827434539795, "adv/std_step_conf": 0.8916371464729309, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.761674870629812, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.23656126482213435, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4150197628458498, "calib/gap": 0.4204493247507257, "calib/mean_conf": 0.5016996047430831, "calib/mu_c": 0.6911510791366906, "calib/mu_w": 0.2707017543859649, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0944268774703557, "calib/std_conf": 0.45685888001099756, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.385746466083151, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.08138656270150851, "calib/step_q_w": 0.3043599033816425, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 474.44921875, "completions/mean_terminated_length": 474.44921875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.12053333333333334, "grad_norm": 0.04429532214999199, "kl": 0.1583099365234375, "learning_rate": 2.4166666666666667e-06, "loss": 0.0292, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035268235951662064, "mask/share_reasoning": 0.8200170397758484, "mask/share_step_conf": 0.14471471309661865, "num_tokens": 26807944.0, "reward": 0.661996603012085, "reward_std": 0.2428886592388153, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7369261980056763, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2815982699394226, "step": 113 }, { "adv/mean_abs_final_conf": 0.6316376328468323, "adv/mean_abs_reasoning": 0.4891188144683838, "adv/mean_abs_step_conf": 0.6227335333824158, "adv/ratio_final_to_reasoning": 1.2913787287723744, "adv/ratio_step_to_reasoning": 1.2731743596067877, "adv/std_final_conf": 0.8420522809028625, "adv/std_reasoning": 0.7392686605453491, "adv/std_step_conf": 0.8441986441612244, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7987742266718008, "calib/avg_num_step_conf": 6.83984375, "calib/ece": 0.18026350461133064, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5573122529644269, "calib/gap": 0.5093815513626834, "calib/mean_conf": 0.6343610013175229, "calib/mu_c": 0.8477777777777777, "calib/mu_w": 0.3383962264150943, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11679841897233197, "calib/std_conf": 0.4438325694398911, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4433333333333333, "calib/step_q_c_n": 981.0, "calib/step_q_gap": 0.11108917748917746, "calib/step_q_w": 0.33224415584415584, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 477.14453125, "completions/mean_terminated_length": 477.14453125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1216, "grad_norm": 0.026019670069217682, "kl": 0.1433258056640625, "learning_rate": 2.388888888888889e-06, "loss": -0.0034, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03754541277885437, "mask/share_reasoning": 0.8097862005233765, "mask/share_step_conf": 0.15266837179660797, "num_tokens": 27035117.0, "reward": 0.6479806900024414, "reward_std": 0.24471187591552734, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7953052520751953, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.1881560981273651, "step": 114 }, { "adv/mean_abs_final_conf": 0.733522355556488, "adv/mean_abs_reasoning": 0.5405679941177368, "adv/mean_abs_step_conf": 0.6315276622772217, "adv/ratio_final_to_reasoning": 1.356947439616126, "adv/ratio_step_to_reasoning": 1.1682668399706877, "adv/std_final_conf": 0.882422149181366, "adv/std_reasoning": 0.7755196690559387, "adv/std_step_conf": 0.8442544341087341, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6455424274973147, "calib/avg_num_step_conf": 6.21875, "calib/ece": 0.35670634920634925, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5595238095238095, "calib/gap": 0.2478151260504201, "calib/mean_conf": 0.6170238095238095, "calib/mu_c": 0.7478151260504201, "calib/mu_w": 0.5, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2507539682539683, "calib/std_conf": 0.45686504145072565, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4201792633015007, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.03574235992548208, "calib/step_q_w": 0.3844369033760186, "calib/step_q_w_n": 859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 459.77734375, "completions/mean_terminated_length": 461.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.12266666666666666, "grad_norm": 0.03011750802397728, "kl": 0.1486053466796875, "learning_rate": 2.361111111111111e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03672679141163826, "mask/share_reasoning": 0.8209772109985352, "mask/share_step_conf": 0.1383897066116333, "num_tokens": 27258084.0, "reward": 0.5477375984191895, "reward_std": 0.267084538936615, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6267191171646118, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.1804749071598053, "step": 115 }, { "adv/mean_abs_final_conf": 0.6818970441818237, "adv/mean_abs_reasoning": 0.53069007396698, "adv/mean_abs_step_conf": 0.6370528936386108, "adv/ratio_final_to_reasoning": 1.284925190110588, "adv/ratio_step_to_reasoning": 1.200423608598055, "adv/std_final_conf": 0.8758549690246582, "adv/std_reasoning": 0.8099659085273743, "adv/std_step_conf": 0.8759077787399292, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7652079664002167, "calib/avg_num_step_conf": 6.6953125, "calib/ece": 0.23586831275720166, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.49794238683127573, "calib/gap": 0.43501300636770085, "calib/mean_conf": 0.5828148148148148, "calib/mu_c": 0.7994262295081968, "calib/mu_w": 0.3644132231404959, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15831275720164611, "calib/std_conf": 0.45523263472062026, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.42356885721340154, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.03861009432680357, "calib/step_q_w": 0.38495876288659797, "calib/step_q_w_n": 970.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 546.57421875, "completions/mean_terminated_length": 550.8779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.12373333333333333, "grad_norm": 0.038956981152296066, "kl": 0.12445068359375, "learning_rate": 2.3333333333333336e-06, "loss": -0.0345, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03369300439953804, "mask/share_reasoning": 0.8271446228027344, "mask/share_step_conf": 0.13134989142417908, "num_tokens": 27502527.0, "reward": 0.6172642707824707, "reward_std": 0.2513754069805145, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7094132900238037, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.241521418094635, "step": 116 }, { "adv/mean_abs_final_conf": 0.7383880615234375, "adv/mean_abs_reasoning": 0.6252665519714355, "adv/mean_abs_step_conf": 0.685823917388916, "adv/ratio_final_to_reasoning": 1.1809172571207196, "adv/ratio_step_to_reasoning": 1.0968504795699465, "adv/std_final_conf": 0.9059211611747742, "adv/std_reasoning": 0.826666533946991, "adv/std_step_conf": 0.8762103915214539, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7002794005588011, "calib/avg_num_step_conf": 6.578125, "calib/ece": 0.3207968127490041, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5657370517928287, "calib/gap": 0.29413639827279664, "calib/mean_conf": 0.6429482071713147, "calib/mu_c": 0.7917741935483872, "calib/mu_w": 0.49763779527559054, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2348605577689244, "calib/std_conf": 0.4411711068561428, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44174929577464794, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.024497800758036647, "calib/step_q_w": 0.4172514950166113, "calib/step_q_w_n": 903.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 508.08203125, "completions/mean_terminated_length": 508.08203125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1248, "grad_norm": 0.02977118082344532, "kl": 0.13299560546875, "learning_rate": 2.305555555555556e-06, "loss": 0.0572, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033809371292591095, "mask/share_reasoning": 0.831067681312561, "mask/share_step_conf": 0.13512293994426727, "num_tokens": 27739196.0, "reward": 0.6096928715705872, "reward_std": 0.29228848218917847, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6669859290122986, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.25943103432655334, "step": 117 }, { "adv/mean_abs_final_conf": 0.6428592801094055, "adv/mean_abs_reasoning": 0.46186357736587524, "adv/mean_abs_step_conf": 0.6033726334571838, "adv/ratio_final_to_reasoning": 1.3918813078437458, "adv/ratio_step_to_reasoning": 1.3063871303694707, "adv/std_final_conf": 0.8607125878334045, "adv/std_reasoning": 0.7206830382347107, "adv/std_step_conf": 0.8277450203895569, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7611044015029522, "calib/avg_num_step_conf": 8.36328125, "calib/ece": 0.2409349593495935, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6585365853658537, "calib/gap": 0.3940217391304349, "calib/mean_conf": 0.736869918699187, "calib/mu_c": 0.9098550724637682, "calib/mu_w": 0.5158333333333333, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20841463414634148, "calib/std_conf": 0.4029953389822974, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.417522022022022, "calib/step_q_c_n": 999.0, "calib/step_q_gap": 0.11228331799400099, "calib/step_q_w": 0.305238704028021, "calib/step_q_w_n": 1142.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 572.18359375, "completions/mean_terminated_length": 576.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.12586666666666665, "grad_norm": 0.02147725597023964, "kl": 0.1240997314453125, "learning_rate": 2.277777777777778e-06, "loss": 0.051, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.032034147530794144, "mask/share_reasoning": 0.8106788396835327, "mask/share_step_conf": 0.14947453141212463, "num_tokens": 27989683.0, "reward": 0.6253924369812012, "reward_std": 0.25418126583099365, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7210777401924133, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.23048833012580872, "step": 118 }, { "adv/mean_abs_final_conf": 0.6779756546020508, "adv/mean_abs_reasoning": 0.5276739597320557, "adv/mean_abs_step_conf": 0.6841847896575928, "adv/ratio_final_to_reasoning": 1.2848381886161597, "adv/ratio_step_to_reasoning": 1.2966051802234295, "adv/std_final_conf": 0.8576713800430298, "adv/std_reasoning": 0.7929013967514038, "adv/std_step_conf": 0.8913794159889221, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7483030225409836, "calib/avg_num_step_conf": 6.96875, "calib/ece": 0.24463999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.568, "calib/gap": 0.4355776127049179, "calib/mean_conf": 0.62736, "calib/mu_c": 0.8399218749999999, "calib/mu_w": 0.404344262295082, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1799999999999999, "calib/std_conf": 0.4554841714044518, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4823654618473895, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.11810969664822807, "calib/step_q_w": 0.3642557651991614, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 554.29296875, "completions/mean_terminated_length": 556.4666748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.12693333333333334, "grad_norm": 0.029179885983467102, "kl": 0.1297149658203125, "learning_rate": 2.25e-06, "loss": -0.0681, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030885759741067886, "mask/share_reasoning": 0.8343116044998169, "mask/share_step_conf": 0.1308964192867279, "num_tokens": 28236646.0, "reward": 0.6131465435028076, "reward_std": 0.28527575731277466, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7247437238693237, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.20858052372932434, "step": 119 }, { "adv/mean_abs_final_conf": 0.6224154829978943, "adv/mean_abs_reasoning": 0.49500298500061035, "adv/mean_abs_step_conf": 0.6371960639953613, "adv/ratio_final_to_reasoning": 1.2573974336682572, "adv/ratio_step_to_reasoning": 1.28725701319675, "adv/std_final_conf": 0.8365208506584167, "adv/std_reasoning": 0.7575954794883728, "adv/std_step_conf": 0.8441767692565918, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7945937090432502, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.1946987951807229, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.606425702811245, "calib/gap": 0.5315203145478375, "calib/mean_conf": 0.6487550200803213, "calib/mu_c": 0.8814285714285715, "calib/mu_w": 0.34990825688073396, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14060240963855422, "calib/std_conf": 0.45502157615211547, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4697817229336438, "calib/step_q_c_n": 859.0, "calib/step_q_gap": 0.08280438582316219, "calib/step_q_w": 0.3869773371104816, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 483.97265625, "completions/mean_terminated_length": 485.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 73.0, "epoch": 0.128, "grad_norm": 0.018994498997926712, "kl": 0.139862060546875, "learning_rate": 2.222222222222222e-06, "loss": -0.0355, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033010274171829224, "mask/share_reasoning": 0.8301706314086914, "mask/share_step_conf": 0.13291281461715698, "num_tokens": 28467231.0, "reward": 0.6592800617218018, "reward_std": 0.25837767124176025, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7789312601089478, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.23650383949279785, "step": 120 }, { "adv/mean_abs_final_conf": 0.7655328512191772, "adv/mean_abs_reasoning": 0.6771527528762817, "adv/mean_abs_step_conf": 0.7010661363601685, "adv/ratio_final_to_reasoning": 1.1305172252013909, "adv/ratio_step_to_reasoning": 1.0353146072024546, "adv/std_final_conf": 0.9147785305976868, "adv/std_reasoning": 0.8590930104255676, "adv/std_step_conf": 0.8915013670921326, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6999674479166667, "calib/avg_num_step_conf": 7.453125, "calib/ece": 0.2822177419354839, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5443548387096774, "calib/gap": 0.34612499999999996, "calib/mean_conf": 0.6318951612903225, "calib/mu_c": 0.799375, "calib/mu_w": 0.45325, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19899193548387098, "calib/std_conf": 0.4440167157298862, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4470113953488372, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.05633754038700517, "calib/step_q_w": 0.39067385496183205, "calib/step_q_w_n": 1048.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 593.07421875, "completions/mean_terminated_length": 600.1067504882812, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.12906666666666666, "grad_norm": 0.028417551890015602, "kl": 0.12103271484375, "learning_rate": 2.1944444444444445e-06, "loss": 0.0138, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02966964617371559, "mask/share_reasoning": 0.8315616250038147, "mask/share_step_conf": 0.12704996764659882, "num_tokens": 28724114.0, "reward": 0.5897777080535889, "reward_std": 0.29276806116104126, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6864152550697327, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.20095272362232208, "step": 121 }, { "adv/mean_abs_final_conf": 0.647843599319458, "adv/mean_abs_reasoning": 0.4866180121898651, "adv/mean_abs_step_conf": 0.6225732564926147, "adv/ratio_final_to_reasoning": 1.3313185765649116, "adv/ratio_step_to_reasoning": 1.2793880228373125, "adv/std_final_conf": 0.8507066965103149, "adv/std_reasoning": 0.7392822504043579, "adv/std_step_conf": 0.8442066311836243, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7967769632124695, "calib/avg_num_step_conf": 6.78515625, "calib/ece": 0.19279999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.636, "calib/gap": 0.5194901261475464, "calib/mean_conf": 0.6808, "calib/mu_c": 0.894829931972789, "calib/mu_w": 0.3753398058252427, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14279999999999995, "calib/std_conf": 0.44115865626778766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49311001778198654, "calib/step_q_c_n": 960.0, "calib/step_q_gap": 0.13423987621184502, "calib/step_q_w": 0.3588701415701415, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 520.67578125, "completions/mean_terminated_length": 522.7176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.13013333333333332, "grad_norm": 0.020348021760582924, "kl": 0.1356964111328125, "learning_rate": 2.166666666666667e-06, "loss": 0.0589, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03320601209998131, "mask/share_reasoning": 0.8267264366149902, "mask/share_step_conf": 0.13616132736206055, "num_tokens": 28964751.0, "reward": 0.6960573792457581, "reward_std": 0.2487361878156662, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7873148322105408, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2946436405181885, "step": 122 }, { "adv/mean_abs_final_conf": 0.7347197532653809, "adv/mean_abs_reasoning": 0.6672190427780151, "adv/mean_abs_step_conf": 0.6992411613464355, "adv/ratio_final_to_reasoning": 1.1011672421793024, "adv/ratio_step_to_reasoning": 1.0479934122310028, "adv/std_final_conf": 0.9062029123306274, "adv/std_reasoning": 0.8903211951255798, "adv/std_step_conf": 0.9068571925163269, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7280187163908094, "calib/avg_num_step_conf": 7.8671875, "calib/ece": 0.2968749999999999, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5916666666666667, "calib/gap": 0.3397381102032265, "calib/mean_conf": 0.668375, "calib/mu_c": 0.8255038759689922, "calib/mu_w": 0.48576576576576574, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.21387499999999993, "calib/std_conf": 0.4370891702025267, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4385541905855339, "calib/step_q_c_n": 871.0, "calib/step_q_gap": 0.07810454084823093, "calib/step_q_w": 0.360449649737303, "calib/step_q_w_n": 1142.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2348.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 592.859375, "completions/mean_terminated_length": 602.2698974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1312, "grad_norm": 0.020996691659092903, "kl": 0.1155853271484375, "learning_rate": 2.138888888888889e-06, "loss": -0.1386, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.029429558664560318, "mask/share_reasoning": 0.825176477432251, "mask/share_step_conf": 0.12976893782615662, "num_tokens": 29221811.0, "reward": 0.5914741158485413, "reward_std": 0.3070160746574402, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6676355600357056, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.227812722325325, "step": 123 }, { "adv/mean_abs_final_conf": 0.6610862016677856, "adv/mean_abs_reasoning": 0.45662179589271545, "adv/mean_abs_step_conf": 0.7047810554504395, "adv/ratio_final_to_reasoning": 1.4477762726488634, "adv/ratio_step_to_reasoning": 1.5434678365989118, "adv/std_final_conf": 0.8762552738189697, "adv/std_reasoning": 0.7391733527183533, "adv/std_step_conf": 0.9068486094474792, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8187683284457479, "calib/avg_num_step_conf": 7.07421875, "calib/ece": 0.1515354330708661, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6062992125984252, "calib/gap": 0.5310518084066473, "calib/mean_conf": 0.6843700787401574, "calib/mu_c": 0.8913548387096776, "calib/mu_w": 0.36030303030303024, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11283464566929129, "calib/std_conf": 0.42829109798889026, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46175232358003443, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.021826283518401213, "calib/step_q_w": 0.4399260400616332, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 526.3125, "completions/mean_terminated_length": 528.3765258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.13226666666666667, "grad_norm": 0.027078785002231598, "kl": 0.13543701171875, "learning_rate": 2.1111111111111114e-06, "loss": 0.0024, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03057037852704525, "mask/share_reasoning": 0.8252548575401306, "mask/share_step_conf": 0.1402685046195984, "num_tokens": 29463363.0, "reward": 0.725241482257843, "reward_std": 0.2493542730808258, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.8193902373313904, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.3115614652633667, "step": 124 }, { "adv/mean_abs_final_conf": 0.6498396396636963, "adv/mean_abs_reasoning": 0.6036573052406311, "adv/mean_abs_step_conf": 0.6918566823005676, "adv/ratio_final_to_reasoning": 1.0765042251988584, "adv/ratio_step_to_reasoning": 1.1461083570002988, "adv/std_final_conf": 0.8475610613822937, "adv/std_reasoning": 0.8266199827194214, "adv/std_step_conf": 0.8916298747062683, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6359220218440438, "calib/avg_num_step_conf": 6.703125, "calib/ece": 0.3336254980079681, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5219123505976095, "calib/gap": 0.24020510541021084, "calib/mean_conf": 0.6195219123505976, "calib/mu_c": 0.7381889763779528, "calib/mu_w": 0.49798387096774194, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2235856573705179, "calib/std_conf": 0.44373165246613516, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4449457177322075, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.031095376018565934, "calib/step_q_w": 0.41385034171364155, "calib/step_q_w_n": 887.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 575.50390625, "completions/mean_terminated_length": 575.50390625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.13333333333333333, "grad_norm": 0.024703064933419228, "kl": 0.13128662109375, "learning_rate": 2.0833333333333334e-06, "loss": 0.023, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03389430046081543, "mask/share_reasoning": 0.8332763910293579, "mask/share_step_conf": 0.13282933831214905, "num_tokens": 29715500.0, "reward": 0.5418754816055298, "reward_std": 0.2697746455669403, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6435273885726929, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.14569228887557983, "step": 125 }, { "adv/mean_abs_final_conf": 0.6910170316696167, "adv/mean_abs_reasoning": 0.560282289981842, "adv/mean_abs_step_conf": 0.633554220199585, "adv/ratio_final_to_reasoning": 1.233337273059285, "adv/ratio_step_to_reasoning": 1.1307768093475121, "adv/std_final_conf": 0.8788483142852783, "adv/std_reasoning": 0.8265383839607239, "adv/std_step_conf": 0.8602001667022705, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7118435083977606, "calib/avg_num_step_conf": 6.57421875, "calib/ece": 0.3139591836734694, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6122448979591837, "calib/gap": 0.3155765129298853, "calib/mean_conf": 0.6692244897959183, "calib/mu_c": 0.8250806451612903, "calib/mu_w": 0.509504132231405, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.23853061224489802, "calib/std_conf": 0.44178835385773985, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.46124213333333336, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.07832466280814582, "calib/step_q_w": 0.38291747052518754, "calib/step_q_w_n": 933.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 542.1640625, "completions/mean_terminated_length": 542.1640625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1344, "grad_norm": 0.02971208468079567, "kl": 0.11859130859375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0043, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03369639068841934, "mask/share_reasoning": 0.8317639827728271, "mask/share_step_conf": 0.13453960418701172, "num_tokens": 29959758.0, "reward": 0.5545740723609924, "reward_std": 0.2554197907447815, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6497457027435303, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.1719023734331131, "step": 126 }, { "adv/mean_abs_final_conf": 0.668084979057312, "adv/mean_abs_reasoning": 0.49337756633758545, "adv/mean_abs_step_conf": 0.6550658941268921, "adv/ratio_final_to_reasoning": 1.3541048978302874, "adv/ratio_step_to_reasoning": 1.327717226767206, "adv/std_final_conf": 0.8700375556945801, "adv/std_reasoning": 0.775467038154602, "adv/std_step_conf": 0.8760483860969543, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7279494474955337, "calib/avg_num_step_conf": 7.2109375, "calib/ece": 0.2797560975609756, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6138211382113821, "calib/gap": 0.4013471845431086, "calib/mean_conf": 0.6791869918699186, "calib/mu_c": 0.8863865546218487, "calib/mu_w": 0.4850393700787401, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23760162601626014, "calib/std_conf": 0.44424521445086596, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4592928660826032, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.07609821469769396, "calib/step_q_w": 0.38319465138490927, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 531.3515625, "completions/mean_terminated_length": 533.435302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.13546666666666668, "grad_norm": 0.049308981746435165, "kl": 0.1278839111328125, "learning_rate": 2.027777777777778e-06, "loss": 0.1015, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03336185961961746, "mask/share_reasoning": 0.815756618976593, "mask/share_step_conf": 0.14697524905204773, "num_tokens": 30199456.0, "reward": 0.5791636109352112, "reward_std": 0.26674067974090576, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.687235951423645, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.18593505024909973, "step": 127 }, { "adv/mean_abs_final_conf": 0.6516941785812378, "adv/mean_abs_reasoning": 0.5147262811660767, "adv/mean_abs_step_conf": 0.5420503616333008, "adv/ratio_final_to_reasoning": 1.266098511824323, "adv/ratio_step_to_reasoning": 1.053084681056742, "adv/std_final_conf": 0.8475783467292786, "adv/std_reasoning": 0.7755802273750305, "adv/std_step_conf": 0.8110368251800537, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6809375423384365, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.3188888888888889, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6008230452674898, "calib/gap": 0.32248137108792846, "calib/mean_conf": 0.6491769547325102, "calib/mu_c": 0.8097540983606557, "calib/mu_w": 0.48727272727272725, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23300411522633743, "calib/std_conf": 0.45567313474030885, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46488379204892966, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.045876942733861226, "calib/step_q_w": 0.41900684931506843, "calib/step_q_w_n": 876.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 508.828125, "completions/mean_terminated_length": 516.90478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.13653333333333334, "grad_norm": 0.03492178022861481, "kl": 0.134613037109375, "learning_rate": 2.0000000000000003e-06, "loss": -0.0242, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.035815007984638214, "mask/share_reasoning": 0.8240078687667847, "mask/share_step_conf": 0.1245521605014801, "num_tokens": 30436380.0, "reward": 0.5387958288192749, "reward_std": 0.2709641456604004, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6398922204971313, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.15410563349723816, "step": 128 }, { "adv/mean_abs_final_conf": 0.6473082304000854, "adv/mean_abs_reasoning": 0.46903902292251587, "adv/mean_abs_step_conf": 0.6198506951332092, "adv/ratio_final_to_reasoning": 1.3800732961765085, "adv/ratio_step_to_reasoning": 1.3215333156525169, "adv/std_final_conf": 0.8524180054664612, "adv/std_reasoning": 0.7393319010734558, "adv/std_step_conf": 0.8440448641777039, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7160808890037472, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.2629083665338645, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5737051792828686, "calib/gap": 0.3410111125468407, "calib/mean_conf": 0.6609960159362549, "calib/mu_c": 0.8090845070422535, "calib/mu_w": 0.46807339449541285, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17908366533864542, "calib/std_conf": 0.4355573020059942, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48249905660377357, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.07723985205831901, "calib/step_q_w": 0.40525920454545455, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 507.01953125, "completions/mean_terminated_length": 509.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 79.0, "epoch": 0.1376, "grad_norm": 0.03975765407085419, "kl": 0.1287384033203125, "learning_rate": 1.9722222222222224e-06, "loss": -0.0231, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036101073026657104, "mask/share_reasoning": 0.8174564838409424, "mask/share_step_conf": 0.14253617823123932, "num_tokens": 30668561.0, "reward": 0.6561309695243835, "reward_std": 0.2599911689758301, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.701225757598877, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.30556732416152954, "step": 129 }, { "adv/mean_abs_final_conf": 0.6017012596130371, "adv/mean_abs_reasoning": 0.4371397793292999, "adv/mean_abs_step_conf": 0.5608911514282227, "adv/ratio_final_to_reasoning": 1.3764504812081448, "adv/ratio_step_to_reasoning": 1.2830933672721194, "adv/std_final_conf": 0.8456298112869263, "adv/std_reasoning": 0.7206131815910339, "adv/std_step_conf": 0.8109444975852966, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8069272499652245, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.1953815261044176, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.5153602726387536, "calib/mean_conf": 0.6685542168674697, "calib/mu_c": 0.8568987341772151, "calib/mu_w": 0.3415384615384615, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11469879518072285, "calib/std_conf": 0.45158746998169363, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4840122807017544, "calib/step_q_c_n": 912.0, "calib/step_q_gap": -0.009379023646071794, "calib/step_q_w": 0.4933913043478262, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 453.984375, "completions/mean_terminated_length": 457.5590515136719, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.13866666666666666, "grad_norm": 0.030927494168281555, "kl": 0.1439208984375, "learning_rate": 1.944444444444445e-06, "loss": -0.0123, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.036079712212085724, "mask/share_reasoning": 0.8173176050186157, "mask/share_step_conf": 0.13879013061523438, "num_tokens": 30890069.0, "reward": 0.6485151052474976, "reward_std": 0.2379131019115448, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7801050543785095, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.1989564597606659, "step": 130 }, { "adv/mean_abs_final_conf": 0.6517130136489868, "adv/mean_abs_reasoning": 0.359472393989563, "adv/mean_abs_step_conf": 0.5277432203292847, "adv/ratio_final_to_reasoning": 1.8129709667438574, "adv/ratio_step_to_reasoning": 1.468105003758946, "adv/std_final_conf": 0.8616915345191956, "adv/std_reasoning": 0.6613028645515442, "adv/std_step_conf": 0.7756242752075195, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.760390552843383, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.24461847389558233, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.41767068273092367, "calib/gap": 0.4423901570127985, "calib/mean_conf": 0.523012048192771, "calib/mu_c": 0.7770754716981132, "calib/mu_w": 0.3346853146853147, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17096385542168674, "calib/std_conf": 0.4544206822772782, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.463901873935264, "calib/step_q_c_n": 587.0, "calib/step_q_gap": 0.0893587386893624, "calib/step_q_w": 0.3745431352459016, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 479.13671875, "completions/mean_terminated_length": 482.9094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.13973333333333332, "grad_norm": 0.02649601548910141, "kl": 0.1383514404296875, "learning_rate": 1.916666666666667e-06, "loss": 0.0021, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03474812209606171, "mask/share_reasoning": 0.8239185810089111, "mask/share_step_conf": 0.13352076709270477, "num_tokens": 31118936.0, "reward": 0.6090091466903687, "reward_std": 0.21556419134140015, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.7330429553985596, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.20841291546821594, "step": 131 }, { "adv/mean_abs_final_conf": 0.6536787152290344, "adv/mean_abs_reasoning": 0.5923720002174377, "adv/mean_abs_step_conf": 0.6798343062400818, "adv/ratio_final_to_reasoning": 1.1034936070393153, "adv/ratio_step_to_reasoning": 1.1476476031793195, "adv/std_final_conf": 0.8471890091896057, "adv/std_reasoning": 0.8100506663322449, "adv/std_step_conf": 0.8759794235229492, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7709086043088451, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.2450202429149797, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5708502024291497, "calib/gap": 0.4534577813461797, "calib/mean_conf": 0.631497975708502, "calib/mu_c": 0.8260992907801419, "calib/mu_w": 0.3726415094339622, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15283400809716596, "calib/std_conf": 0.459424678724384, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44621748768472896, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.082739797396015, "calib/step_q_w": 0.36347769028871396, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2170.0, "completions/max_terminated_length": 2170.0, "completions/mean_length": 499.90625, "completions/mean_terminated_length": 501.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1408, "grad_norm": 0.02676284685730934, "kl": 0.128265380859375, "learning_rate": 1.888888888888889e-06, "loss": 0.0151, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03550625964999199, "mask/share_reasoning": 0.8245701193809509, "mask/share_step_conf": 0.13601738214492798, "num_tokens": 31352504.0, "reward": 0.6413894891738892, "reward_std": 0.2601076364517212, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7356421947479248, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.2440117746591568, "step": 132 }, { "adv/mean_abs_final_conf": 0.7299935817718506, "adv/mean_abs_reasoning": 0.5942347049713135, "adv/mean_abs_step_conf": 0.7109254598617554, "adv/ratio_final_to_reasoning": 1.228460027098368, "adv/ratio_step_to_reasoning": 1.1963714907833851, "adv/std_final_conf": 0.8762795329093933, "adv/std_reasoning": 0.8266639709472656, "adv/std_step_conf": 0.8916401267051697, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6933445945945945, "calib/avg_num_step_conf": 7.0703125, "calib/ece": 0.32233870967741934, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.39919354838709675, "calib/gap": 0.3234216216216216, "calib/mean_conf": 0.4787903225806452, "calib/mu_c": 0.6718, "calib/mu_w": 0.3483783783783784, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1989516129032258, "calib/std_conf": 0.4670956811941409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4459450222882615, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.110076420705148, "calib/step_q_w": 0.3358686015831135, "calib/step_q_w_n": 1137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 571.6015625, "completions/mean_terminated_length": 573.8432006835938, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.14186666666666667, "grad_norm": 0.027795914560556412, "kl": 0.113494873046875, "learning_rate": 1.8611111111111113e-06, "loss": -0.038, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0295706819742918, "mask/share_reasoning": 0.8409591913223267, "mask/share_step_conf": 0.1255638748407364, "num_tokens": 31605178.0, "reward": 0.5707153081893921, "reward_std": 0.2753121852874756, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.669532060623169, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2000235617160797, "step": 133 }, { "adv/mean_abs_final_conf": 0.6756203770637512, "adv/mean_abs_reasoning": 0.6351618766784668, "adv/mean_abs_step_conf": 0.6974896788597107, "adv/ratio_final_to_reasoning": 1.0636979357087035, "adv/ratio_step_to_reasoning": 1.0981290037544171, "adv/std_final_conf": 0.8517118096351624, "adv/std_reasoning": 0.8431515097618103, "adv/std_step_conf": 0.8916091918945312, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7510065691883874, "calib/avg_num_step_conf": 6.90234375, "calib/ece": 0.25382352941176467, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.39915966386554624, "calib/gap": 0.4461220597584234, "calib/mean_conf": 0.46911764705882353, "calib/mu_c": 0.6884297520661157, "calib/mu_w": 0.24230769230769228, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10726890756302519, "calib/std_conf": 0.4703829778202386, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4298724489795918, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.1037989792363358, "calib/step_q_w": 0.326073469743256, "calib/step_q_w_n": 983.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 595.92578125, "completions/mean_terminated_length": 605.3849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.14293333333333333, "grad_norm": 0.0486784428358078, "kl": 0.1104888916015625, "learning_rate": 1.8333333333333333e-06, "loss": -0.0415, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.028609707951545715, "mask/share_reasoning": 0.8345072269439697, "mask/share_step_conf": 0.12125807255506516, "num_tokens": 31866687.0, "reward": 0.6060777902603149, "reward_std": 0.2680931091308594, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.693605899810791, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.23886226117610931, "step": 134 }, { "adv/mean_abs_final_conf": 0.6499534845352173, "adv/mean_abs_reasoning": 0.46348199248313904, "adv/mean_abs_step_conf": 0.744178056716919, "adv/ratio_final_to_reasoning": 1.4023273721014347, "adv/ratio_step_to_reasoning": 1.6056245308041635, "adv/std_final_conf": 0.8586642146110535, "adv/std_reasoning": 0.739395260810852, "adv/std_step_conf": 0.9218146800994873, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7360064935064936, "calib/avg_num_step_conf": 6.68359375, "calib/ece": 0.25564000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.456, "calib/gap": 0.4040519480519481, "calib/mean_conf": 0.5383600000000001, "calib/mu_c": 0.7161428571428572, "calib/mu_w": 0.3120909090909091, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11699999999999997, "calib/std_conf": 0.46451535001547584, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4249075089392133, "calib/step_q_c_n": 839.0, "calib/step_q_gap": 0.04308209036123162, "calib/step_q_w": 0.3818254185779817, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 520.26171875, "completions/mean_terminated_length": 524.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.144, "grad_norm": 0.05749247968196869, "kl": 0.1352386474609375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0214, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03426843136548996, "mask/share_reasoning": 0.8217822909355164, "mask/share_step_conf": 0.13613678514957428, "num_tokens": 32105754.0, "reward": 0.6669447422027588, "reward_std": 0.2399473935365677, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7192128896713257, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.30998915433883667, "step": 135 }, { "adv/mean_abs_final_conf": 0.6894182562828064, "adv/mean_abs_reasoning": 0.6038837432861328, "adv/mean_abs_step_conf": 0.5832561254501343, "adv/ratio_final_to_reasoning": 1.1416406948317956, "adv/ratio_step_to_reasoning": 0.9658417401274126, "adv/std_final_conf": 0.8895182609558105, "adv/std_reasoning": 0.8590108752250671, "adv/std_step_conf": 0.8441848754882812, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.828699945887446, "calib/avg_num_step_conf": 6.98828125, "calib/ece": 0.17840163934426229, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.3770491803278688, "calib/gap": 0.5560308441558443, "calib/mean_conf": 0.4525, "calib/mu_c": 0.7533035714285715, "calib/mu_w": 0.19727272727272727, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.08594262295081966, "calib/std_conf": 0.465212691530074, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4268848167539267, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.13105847529051207, "calib/step_q_w": 0.29582634146341463, "calib/step_q_w_n": 1025.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 529.38671875, "completions/mean_terminated_length": 531.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.14506666666666668, "grad_norm": 0.04339177906513214, "kl": 0.1258392333984375, "learning_rate": 1.777777777777778e-06, "loss": -0.0418, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03298787772655487, "mask/share_reasoning": 0.8215695023536682, "mask/share_step_conf": 0.14153635501861572, "num_tokens": 32349765.0, "reward": 0.6436047554016113, "reward_std": 0.2522132098674774, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.758393406867981, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.25381606817245483, "step": 136 }, { "adv/mean_abs_final_conf": 0.6921361684799194, "adv/mean_abs_reasoning": 0.4638771116733551, "adv/mean_abs_step_conf": 0.6849335432052612, "adv/ratio_final_to_reasoning": 1.4920679444242462, "adv/ratio_step_to_reasoning": 1.4765409328657841, "adv/std_final_conf": 0.8809284567832947, "adv/std_reasoning": 0.7393264174461365, "adv/std_step_conf": 0.8911353945732117, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7079516539440203, "calib/avg_num_step_conf": 7.34375, "calib/ece": 0.28537848605577687, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4541832669322709, "calib/gap": 0.3559198473282443, "calib/mean_conf": 0.5352589641434262, "calib/mu_c": 0.7054198473282443, "calib/mu_w": 0.3495, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1493625498007968, "calib/std_conf": 0.4609839561611346, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38678796821793415, "calib/step_q_c_n": 881.0, "calib/step_q_gap": 0.019320200450166414, "calib/step_q_w": 0.36746776776776774, "calib/step_q_w_n": 999.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 503.07421875, "completions/mean_terminated_length": 507.0354309082031, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.14613333333333334, "grad_norm": 0.026422398164868355, "kl": 0.129486083984375, "learning_rate": 1.75e-06, "loss": -0.0271, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03263844549655914, "mask/share_reasoning": 0.8191424608230591, "mask/share_step_conf": 0.14040660858154297, "num_tokens": 32585536.0, "reward": 0.6648370623588562, "reward_std": 0.23320212960243225, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7014410495758057, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.32979562878608704, "step": 137 }, { "adv/mean_abs_final_conf": 0.658505916595459, "adv/mean_abs_reasoning": 0.5958113074302673, "adv/mean_abs_step_conf": 0.7404865026473999, "adv/ratio_final_to_reasoning": 1.1052256115037382, "adv/ratio_step_to_reasoning": 1.242820492684364, "adv/std_final_conf": 0.8746519088745117, "adv/std_reasoning": 0.826693058013916, "adv/std_step_conf": 0.9218471050262451, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7793832363368788, "calib/avg_num_step_conf": 6.84765625, "calib/ece": 0.21891999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.48, "calib/gap": 0.49603251053582176, "calib/mean_conf": 0.5432400000000001, "calib/mu_c": 0.7396688741721854, "calib/mu_w": 0.24363636363636362, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07907999999999996, "calib/std_conf": 0.47327402464111634, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3979805615550756, "calib/step_q_c_n": 926.0, "calib/step_q_gap": 0.11956822781867898, "calib/step_q_w": 0.2784123337363966, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 516.484375, "completions/mean_terminated_length": 516.484375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1472, "grad_norm": 0.05227003991603851, "kl": 0.1251220703125, "learning_rate": 1.7222222222222224e-06, "loss": 0.0845, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03415646031498909, "mask/share_reasoning": 0.829301118850708, "mask/share_step_conf": 0.136542409658432, "num_tokens": 32822092.0, "reward": 0.6527807712554932, "reward_std": 0.2752731442451477, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.752365231513977, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.23991498351097107, "step": 138 }, { "adv/mean_abs_final_conf": 0.6724894046783447, "adv/mean_abs_reasoning": 0.5220587253570557, "adv/mean_abs_step_conf": 0.6653573513031006, "adv/ratio_final_to_reasoning": 1.2881489610549155, "adv/ratio_step_to_reasoning": 1.2744875604713581, "adv/std_final_conf": 0.8750696778297424, "adv/std_reasoning": 0.7754369974136353, "adv/std_step_conf": 0.8757834434509277, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7785954301075267, "calib/avg_num_step_conf": 5.91796875, "calib/ece": 0.2250996015936255, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.47410358565737054, "calib/gap": 0.4737291666666668, "calib/mean_conf": 0.556812749003984, "calib/mu_c": 0.7380000000000001, "calib/mu_w": 0.2642708333333333, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08219123505976099, "calib/std_conf": 0.4621871191028094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.409071871657754, "calib/step_q_c_n": 935.0, "calib/step_q_gap": 0.016632044071547114, "calib/step_q_w": 0.3924398275862069, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 449.234375, "completions/mean_terminated_length": 450.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.14826666666666666, "grad_norm": 0.03387359529733658, "kl": 0.13848876953125, "learning_rate": 1.6944444444444446e-06, "loss": -0.0612, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035630371421575546, "mask/share_reasoning": 0.8260866403579712, "mask/share_step_conf": 0.13437677919864655, "num_tokens": 33040192.0, "reward": 0.6981635689735413, "reward_std": 0.253256618976593, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7552422285079956, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.32389748096466064, "step": 139 }, { "adv/mean_abs_final_conf": 0.6112039685249329, "adv/mean_abs_reasoning": 0.4724005162715912, "adv/mean_abs_step_conf": 0.6800662875175476, "adv/ratio_final_to_reasoning": 1.2938257844187901, "adv/ratio_step_to_reasoning": 1.4395968338158331, "adv/std_final_conf": 0.8223353624343872, "adv/std_reasoning": 0.7206501364707947, "adv/std_step_conf": 0.8915489315986633, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7473584108199492, "calib/avg_num_step_conf": 6.24609375, "calib/ece": 0.22355731225296432, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6640316205533597, "calib/gap": 0.39697449985911526, "calib/mean_conf": 0.7264822134387352, "calib/mu_c": 0.8582840236686391, "calib/mu_w": 0.46130952380952384, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14102766798418964, "calib/std_conf": 0.4179962042396283, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.44277988614800756, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.041146675138833244, "calib/step_q_w": 0.4016332110091743, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 504.875, "completions/mean_terminated_length": 506.85491943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.14933333333333335, "grad_norm": 0.03778177872300148, "kl": 0.1361083984375, "learning_rate": 1.6666666666666667e-06, "loss": 0.0494, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03563731163740158, "mask/share_reasoning": 0.8246203660964966, "mask/share_step_conf": 0.13583609461784363, "num_tokens": 33274456.0, "reward": 0.701540470123291, "reward_std": 0.26073405146598816, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7632448673248291, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.31092965602874756, "step": 140 }, { "adv/mean_abs_final_conf": 0.5802136659622192, "adv/mean_abs_reasoning": 0.4407306909561157, "adv/mean_abs_step_conf": 0.6841034293174744, "adv/ratio_final_to_reasoning": 1.3164811933190104, "adv/ratio_step_to_reasoning": 1.552202838049215, "adv/std_final_conf": 0.8013152480125427, "adv/std_reasoning": 0.7205579876899719, "adv/std_step_conf": 0.87605220079422, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8563920454545455, "calib/avg_num_step_conf": 6.546875, "calib/ece": 0.14665322580645154, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6330645161290323, "calib/gap": 0.5917727272727273, "calib/mean_conf": 0.703266129032258, "calib/mu_c": 0.91325, "calib/mu_w": 0.3214772727272727, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10237903225806445, "calib/std_conf": 0.42918271184086637, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.42003910840932124, "calib/step_q_c_n": 987.0, "calib/step_q_gap": 0.12816682974458976, "calib/step_q_w": 0.2918722786647315, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 562.0078125, "completions/mean_terminated_length": 562.0078125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1504, "grad_norm": 0.02137957699596882, "kl": 0.11334228515625, "learning_rate": 1.638888888888889e-06, "loss": 0.0788, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031555481255054474, "mask/share_reasoning": 0.842831015586853, "mask/share_step_conf": 0.1256134808063507, "num_tokens": 33525426.0, "reward": 0.703129768371582, "reward_std": 0.22637996077537537, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.8222468495368958, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.266044020652771, "step": 141 }, { "adv/mean_abs_final_conf": 0.6251518726348877, "adv/mean_abs_reasoning": 0.4635844826698303, "adv/mean_abs_step_conf": 0.5964958667755127, "adv/ratio_final_to_reasoning": 1.3485176834103123, "adv/ratio_step_to_reasoning": 1.2867036949559487, "adv/std_final_conf": 0.8468737006187439, "adv/std_reasoning": 0.7206167578697205, "adv/std_step_conf": 0.8278431296348572, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7952395964691047, "calib/avg_num_step_conf": 7.12890625, "calib/ece": 0.2280436507936508, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5753968253968254, "calib/gap": 0.4495239596469105, "calib/mean_conf": 0.6537579365079366, "calib/mu_c": 0.8713846153846154, "calib/mu_w": 0.4218606557377049, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18296428571428572, "calib/std_conf": 0.4389258634752434, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4350495689655172, "calib/step_q_c_n": 928.0, "calib/step_q_gap": 0.10051222225425749, "calib/step_q_w": 0.33453734671125973, "calib/step_q_w_n": 897.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 540.765625, "completions/mean_terminated_length": 542.8862915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.15146666666666667, "grad_norm": 0.024171728640794754, "kl": 0.129547119140625, "learning_rate": 1.6111111111111113e-06, "loss": 0.022, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032716378569602966, "mask/share_reasoning": 0.8250454664230347, "mask/share_step_conf": 0.13833190500736237, "num_tokens": 33769022.0, "reward": 0.6451792120933533, "reward_std": 0.2571515440940857, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.75119549036026, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.24072539806365967, "step": 142 }, { "adv/mean_abs_final_conf": 0.6038858294487, "adv/mean_abs_reasoning": 0.4694972336292267, "adv/mean_abs_step_conf": 0.5874793529510498, "adv/ratio_final_to_reasoning": 1.2862393773455185, "adv/ratio_step_to_reasoning": 1.2512945995651945, "adv/std_final_conf": 0.8353819251060486, "adv/std_reasoning": 0.7392933964729309, "adv/std_step_conf": 0.8440662622451782, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7206875993640699, "calib/avg_num_step_conf": 7.02734375, "calib/ece": 0.2959935222672065, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.680161943319838, "calib/gap": 0.3548519872813991, "calib/mean_conf": 0.7045732793522268, "calib/mu_c": 0.8640411764705883, "calib/mu_w": 0.5091891891891892, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22497975708502027, "calib/std_conf": 0.4418696667151187, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3864024096385542, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.10418259539706815, "calib/step_q_w": 0.28221981424148607, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 552.2890625, "completions/mean_terminated_length": 554.4549560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.15253333333333333, "grad_norm": 0.03752093389630318, "kl": 0.11902618408203125, "learning_rate": 1.5833333333333333e-06, "loss": 0.0082, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031237218528985977, "mask/share_reasoning": 0.8308833837509155, "mask/share_step_conf": 0.1339731514453888, "num_tokens": 34017744.0, "reward": 0.6695082187652588, "reward_std": 0.248559832572937, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6803755760192871, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.3602033853530884, "step": 143 }, { "adv/mean_abs_final_conf": 0.6100111603736877, "adv/mean_abs_reasoning": 0.5660476684570312, "adv/mean_abs_step_conf": 0.6275326609611511, "adv/ratio_final_to_reasoning": 1.0776674728411035, "adv/ratio_step_to_reasoning": 1.1086215807084225, "adv/std_final_conf": 0.8367601037025452, "adv/std_reasoning": 0.8098620176315308, "adv/std_step_conf": 0.8442287445068359, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7195312500000001, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.2639285714285714, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.623015873015873, "calib/gap": 0.378171195652174, "calib/mean_conf": 0.6675, "calib/mu_c": 0.8055625000000001, "calib/mu_w": 0.4273913043478261, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14825396825396828, "calib/std_conf": 0.4542110070098355, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.41705943579766536, "calib/step_q_c_n": 1028.0, "calib/step_q_gap": 0.0800904092489928, "calib/step_q_w": 0.33696902654867256, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 503.96484375, "completions/mean_terminated_length": 505.9411926269531, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.1536, "grad_norm": 0.0276224035769701, "kl": 0.1339874267578125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0009, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034667547792196274, "mask/share_reasoning": 0.827467143535614, "mask/share_step_conf": 0.13395905494689941, "num_tokens": 34250887.0, "reward": 0.6699892282485962, "reward_std": 0.25267669558525085, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7207433581352234, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.29814136028289795, "step": 144 }, { "adv/mean_abs_final_conf": 0.6298481225967407, "adv/mean_abs_reasoning": 0.5598827600479126, "adv/mean_abs_step_conf": 0.6489096879959106, "adv/ratio_final_to_reasoning": 1.1249643095687403, "adv/ratio_step_to_reasoning": 1.1590099469045616, "adv/std_final_conf": 0.8393099904060364, "adv/std_reasoning": 0.7756425142288208, "adv/std_step_conf": 0.860239565372467, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.72692334814188, "calib/avg_num_step_conf": 7.2421875, "calib/ece": 0.23447999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.724, "calib/gap": 0.31966998096043975, "calib/mean_conf": 0.7980799999999999, "calib/mu_c": 0.9093251533742329, "calib/mu_w": 0.5896551724137932, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19027999999999995, "calib/std_conf": 0.3684023800140276, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3951980113636364, "calib/step_q_c_n": 1056.0, "calib/step_q_gap": 0.046954598565643935, "calib/step_q_w": 0.3482434127979925, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 521.74609375, "completions/mean_terminated_length": 523.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.15466666666666667, "grad_norm": 0.022892968729138374, "kl": 0.121978759765625, "learning_rate": 1.527777777777778e-06, "loss": 0.0565, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03505028784275055, "mask/share_reasoning": 0.8147192001342773, "mask/share_step_conf": 0.1463243067264557, "num_tokens": 34487158.0, "reward": 0.6805267930030823, "reward_std": 0.2644423842430115, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7355449199676514, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.3051961064338684, "step": 145 }, { "adv/mean_abs_final_conf": 0.6573969125747681, "adv/mean_abs_reasoning": 0.5301852226257324, "adv/mean_abs_step_conf": 0.6028902530670166, "adv/ratio_final_to_reasoning": 1.2399382037074178, "adv/ratio_step_to_reasoning": 1.1371313785043156, "adv/std_final_conf": 0.8653373122215271, "adv/std_reasoning": 0.7755250334739685, "adv/std_step_conf": 0.8441872596740723, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7015346797767739, "calib/avg_num_step_conf": 6.94140625, "calib/ece": 0.3297564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6532258064516129, "calib/gap": 0.3626466648950306, "calib/mean_conf": 0.6966951612903225, "calib/mu_c": 0.9043396226415095, "calib/mu_w": 0.5416929577464789, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2995161290322581, "calib/std_conf": 0.43562126341338897, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36159009762900973, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.025999634928348725, "calib/step_q_w": 0.335590462700661, "calib/step_q_w_n": 1059.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 558.21484375, "completions/mean_terminated_length": 560.4039306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.15573333333333333, "grad_norm": 0.04226524382829666, "kl": 0.116302490234375, "learning_rate": 1.5e-06, "loss": 0.0554, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03268972411751747, "mask/share_reasoning": 0.8304544687271118, "mask/share_step_conf": 0.13294953107833862, "num_tokens": 34737277.0, "reward": 0.5627621412277222, "reward_std": 0.29553359746932983, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.6492359638214111, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.2005070298910141, "step": 146 }, { "adv/mean_abs_final_conf": 0.6402840614318848, "adv/mean_abs_reasoning": 0.4667801558971405, "adv/mean_abs_step_conf": 0.5430867075920105, "adv/ratio_final_to_reasoning": 1.3717036882197227, "adv/ratio_step_to_reasoning": 1.1634742838375607, "adv/std_final_conf": 0.8318468928337097, "adv/std_reasoning": 0.7392444014549255, "adv/std_step_conf": 0.7939841151237488, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7006680369989722, "calib/avg_num_step_conf": 7.5625, "calib/ece": 0.3455378486055776, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6653386454183267, "calib/gap": 0.33852453751284695, "calib/mean_conf": 0.7201195219123506, "calib/mu_c": 0.9075892857142858, "calib/mu_w": 0.5690647482014388, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30972111553784853, "calib/std_conf": 0.4287705783074233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3855332070707071, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.027485350552771848, "calib/step_q_w": 0.35804785651793525, "calib/step_q_w_n": 1143.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2011.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 548.55078125, "completions/mean_terminated_length": 552.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.1568, "grad_norm": 0.0274009071290493, "kl": 0.118560791015625, "learning_rate": 1.4722222222222225e-06, "loss": -0.0147, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031080050393939018, "mask/share_reasoning": 0.8226253986358643, "mask/share_step_conf": 0.1384820193052292, "num_tokens": 34981386.0, "reward": 0.5492904186248779, "reward_std": 0.27415531873703003, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6445058584213257, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.1712624877691269, "step": 147 }, { "adv/mean_abs_final_conf": 0.5738561153411865, "adv/mean_abs_reasoning": 0.4341607093811035, "adv/mean_abs_step_conf": 0.644412636756897, "adv/ratio_final_to_reasoning": 1.3217596685780686, "adv/ratio_step_to_reasoning": 1.484272120513871, "adv/std_final_conf": 0.8112228512763977, "adv/std_reasoning": 0.7206644415855408, "adv/std_step_conf": 0.8595237731933594, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7223947550034506, "calib/avg_num_step_conf": 6.33984375, "calib/ece": 0.23864541832669328, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7330677290836654, "calib/gap": 0.3621325051759834, "calib/mean_conf": 0.7756175298804782, "calib/mu_c": 0.9054658385093167, "calib/mu_w": 0.5433333333333333, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18641434262948214, "calib/std_conf": 0.3977355188559799, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3938636085626912, "calib/step_q_c_n": 981.0, "calib/step_q_gap": 0.09011111635085323, "calib/step_q_w": 0.303752492211838, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 479.2578125, "completions/mean_terminated_length": 481.1372985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.15786666666666666, "grad_norm": 0.038541216403245926, "kl": 0.134246826171875, "learning_rate": 1.4444444444444445e-06, "loss": 0.0108, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03732118010520935, "mask/share_reasoning": 0.8160722255706787, "mask/share_step_conf": 0.14270035922527313, "num_tokens": 35209188.0, "reward": 0.68854820728302, "reward_std": 0.26796114444732666, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7455320358276367, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.30968934297561646, "step": 148 }, { "adv/mean_abs_final_conf": 0.6545625925064087, "adv/mean_abs_reasoning": 0.5150051116943359, "adv/mean_abs_step_conf": 0.6115411520004272, "adv/ratio_final_to_reasoning": 1.2709827099636681, "adv/ratio_step_to_reasoning": 1.187446761428238, "adv/std_final_conf": 0.871042788028717, "adv/std_reasoning": 0.7754599452018738, "adv/std_step_conf": 0.8278548121452332, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7188610883237626, "calib/avg_num_step_conf": 6.83984375, "calib/ece": 0.2547540983606557, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7295081967213115, "calib/gap": 0.4094722450095708, "calib/mean_conf": 0.7557377049180327, "calib/mu_c": 0.9336231884057972, "calib/mu_w": 0.5241509433962264, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2224590163934426, "calib/std_conf": 0.4156634397390709, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3885357142857143, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.11689312372588995, "calib/step_q_w": 0.27164259055982437, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2917.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 554.8671875, "completions/mean_terminated_length": 554.8671875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.15893333333333334, "grad_norm": 0.03235912695527077, "kl": 0.118988037109375, "learning_rate": 1.4166666666666667e-06, "loss": 0.0148, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03250977396965027, "mask/share_reasoning": 0.8357325196266174, "mask/share_step_conf": 0.1317577064037323, "num_tokens": 35455690.0, "reward": 0.6493133306503296, "reward_std": 0.2924065589904785, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7115804553031921, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.2886087894439697, "step": 149 }, { "adv/mean_abs_final_conf": 0.6426482796669006, "adv/mean_abs_reasoning": 0.5884519815444946, "adv/mean_abs_step_conf": 0.638965368270874, "adv/ratio_final_to_reasoning": 1.0920997801386587, "adv/ratio_step_to_reasoning": 1.08584113625346, "adv/std_final_conf": 0.8393579721450806, "adv/std_reasoning": 0.8099759817123413, "adv/std_step_conf": 0.8442474007606506, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6981818181818182, "calib/avg_num_step_conf": 6.90234375, "calib/ece": 0.308326530612245, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7224489795918367, "calib/gap": 0.33105387205387204, "calib/mean_conf": 0.748326530612245, "calib/mu_c": 0.896962962962963, "calib/mu_w": 0.5659090909090909, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25281632653061237, "calib/std_conf": 0.4185290757585551, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3906459387483356, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.13458196237038283, "calib/step_q_w": 0.25606397637795275, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 466.2890625, "completions/mean_terminated_length": 471.8182067871094, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.16, "grad_norm": 0.029916075989603996, "kl": 0.136199951171875, "learning_rate": 1.3888888888888892e-06, "loss": -0.007, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.038730621337890625, "mask/share_reasoning": 0.8002532720565796, "mask/share_step_conf": 0.1492973417043686, "num_tokens": 35680020.0, "reward": 0.5956156849861145, "reward_std": 0.30623573064804077, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6721328496932983, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.22222355008125305, "step": 150 }, { "adv/mean_abs_final_conf": 0.6335014700889587, "adv/mean_abs_reasoning": 0.490355908870697, "adv/mean_abs_step_conf": 0.5756443738937378, "adv/ratio_final_to_reasoning": 1.2919217625987822, "adv/ratio_step_to_reasoning": 1.173931757485012, "adv/std_final_conf": 0.8491879105567932, "adv/std_reasoning": 0.7576881647109985, "adv/std_step_conf": 0.8277603387832642, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7628980670282297, "calib/avg_num_step_conf": 6.53515625, "calib/ece": 0.26148148148148154, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.551440329218107, "calib/gap": 0.4950000000000001, "calib/mean_conf": 0.591111111111111, "calib/mu_c": 0.8783333333333334, "calib/mu_w": 0.3833333333333333, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21641975308641978, "calib/std_conf": 0.475687070772805, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.34270998415213944, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.09954299758784002, "calib/step_q_w": 0.24316698656429941, "calib/step_q_w_n": 1042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 552.0, "completions/mean_terminated_length": 554.1647338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.16106666666666666, "grad_norm": 0.03558661788702011, "kl": 0.1099700927734375, "learning_rate": 1.3611111111111112e-06, "loss": 0.0581, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.032656438648700714, "mask/share_reasoning": 0.8395566940307617, "mask/share_step_conf": 0.12388060241937637, "num_tokens": 35928356.0, "reward": 0.5653917193412781, "reward_std": 0.2516186833381653, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.7003406286239624, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.16169285774230957, "step": 151 }, { "adv/mean_abs_final_conf": 0.6823998093605042, "adv/mean_abs_reasoning": 0.6121759414672852, "adv/mean_abs_step_conf": 0.7128264307975769, "adv/ratio_final_to_reasoning": 1.114711904105385, "adv/ratio_step_to_reasoning": 1.1644143170491952, "adv/std_final_conf": 0.8533605337142944, "adv/std_reasoning": 0.8101208209991455, "adv/std_step_conf": 0.9067031741142273, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7731900826446282, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.2580894308943089, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5487804878048781, "calib/gap": 0.4535914049586778, "calib/mean_conf": 0.5917479674796747, "calib/mu_c": 0.8222314049586777, "calib/mu_w": 0.36863999999999997, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.17898373983739835, "calib/std_conf": 0.4751317676626246, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3809090909090909, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.10074807579030043, "calib/step_q_w": 0.2801610151187905, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2947.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 498.38671875, "completions/mean_terminated_length": 504.29644775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.16213333333333332, "grad_norm": 0.03145277500152588, "kl": 0.12725830078125, "learning_rate": 1.3333333333333334e-06, "loss": 0.0312, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03540301322937012, "mask/share_reasoning": 0.8154290914535522, "mask/share_step_conf": 0.13744911551475525, "num_tokens": 36161335.0, "reward": 0.6328657865524292, "reward_std": 0.2908143699169159, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7121277451515198, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2668851613998413, "step": 152 }, { "adv/mean_abs_final_conf": 0.6412728428840637, "adv/mean_abs_reasoning": 0.44410085678100586, "adv/mean_abs_step_conf": 0.680346667766571, "adv/ratio_final_to_reasoning": 1.4439801974988913, "adv/ratio_step_to_reasoning": 1.5319643215686527, "adv/std_final_conf": 0.8501389026641846, "adv/std_reasoning": 0.7393417954444885, "adv/std_step_conf": 0.9067375659942627, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6180758017492711, "calib/avg_num_step_conf": 6.5234375, "calib/ece": 0.350938775510204, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5428571428571428, "calib/gap": 0.20397959183673475, "calib/mean_conf": 0.5994285714285715, "calib/mu_c": 0.6810204081632654, "calib/mu_w": 0.4770408163265306, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1751836734693877, "calib/std_conf": 0.46074881376706234, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3553361334867664, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.0828191547102371, "calib/step_q_w": 0.2725169787765293, "calib/step_q_w_n": 801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 519.00390625, "completions/mean_terminated_length": 521.0392456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1632, "grad_norm": 0.029651034623384476, "kl": 0.1294708251953125, "learning_rate": 1.3055555555555556e-06, "loss": -0.0053, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.032412245869636536, "mask/share_reasoning": 0.8370093107223511, "mask/share_step_conf": 0.12667220830917358, "num_tokens": 36401520.0, "reward": 0.5983877182006836, "reward_std": 0.23342570662498474, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6178789138793945, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.27264654636383057, "step": 153 }, { "adv/mean_abs_final_conf": 0.6510841250419617, "adv/mean_abs_reasoning": 0.5419851541519165, "adv/mean_abs_step_conf": 0.6551204919815063, "adv/ratio_final_to_reasoning": 1.2012951278356696, "adv/ratio_step_to_reasoning": 1.20874250330089, "adv/std_final_conf": 0.8783076405525208, "adv/std_reasoning": 0.7930098176002502, "adv/std_step_conf": 0.8603218793869019, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.724609375, "calib/avg_num_step_conf": 6.1953125, "calib/ece": 0.2787295081967213, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4344262295081967, "calib/gap": 0.38049030172413784, "calib/mean_conf": 0.5097950819672131, "calib/mu_c": 0.7093965517241378, "calib/mu_w": 0.32890625, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15655737704918038, "calib/std_conf": 0.4688330899214785, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3723203170028819, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.09271269368449625, "calib/step_q_w": 0.2796076233183856, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 508.75390625, "completions/mean_terminated_length": 510.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.16426666666666667, "grad_norm": 0.029042229056358337, "kl": 0.1295013427734375, "learning_rate": 1.2777777777777779e-06, "loss": 0.0572, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.034418318420648575, "mask/share_reasoning": 0.8314560055732727, "mask/share_step_conf": 0.1302194446325302, "num_tokens": 36636201.0, "reward": 0.5632578730583191, "reward_std": 0.24759289622306824, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6856808662414551, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.15958485007286072, "step": 154 }, { "adv/mean_abs_final_conf": 0.7743602991104126, "adv/mean_abs_reasoning": 0.5609658360481262, "adv/mean_abs_step_conf": 0.5982762575149536, "adv/ratio_final_to_reasoning": 1.3804054531477365, "adv/ratio_step_to_reasoning": 1.0665110405469085, "adv/std_final_conf": 0.9183408617973328, "adv/std_reasoning": 0.7929477691650391, "adv/std_step_conf": 0.8279053568840027, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7444473823373876, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.2577235772357723, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3048780487804878, "calib/gap": 0.4012017451084082, "calib/mean_conf": 0.38227642276422763, "calib/mu_c": 0.5845081967213115, "calib/mu_w": 0.18330645161290324, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.07203252032520323, "calib/std_conf": 0.4505738220524032, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.31419947159841477, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.05493075515303664, "calib/step_q_w": 0.25926871644537813, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 452.453125, "completions/mean_terminated_length": 457.8182067871094, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.16533333333333333, "grad_norm": 0.028493136167526245, "kl": 0.1419219970703125, "learning_rate": 1.25e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03633648157119751, "mask/share_reasoning": 0.8170293569564819, "mask/share_step_conf": 0.13491545617580414, "num_tokens": 36859245.0, "reward": 0.5959603786468506, "reward_std": 0.2267536222934723, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6982449293136597, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.20695719122886658, "step": 155 }, { "adv/mean_abs_final_conf": 0.7039519548416138, "adv/mean_abs_reasoning": 0.5900039672851562, "adv/mean_abs_step_conf": 0.6915647983551025, "adv/ratio_final_to_reasoning": 1.1931308836460501, "adv/ratio_step_to_reasoning": 1.1721358443355359, "adv/std_final_conf": 0.891319990158081, "adv/std_reasoning": 0.8098912835121155, "adv/std_step_conf": 0.8760480880737305, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6519278043731401, "calib/avg_num_step_conf": 6.71875, "calib/ece": 0.34212851405622485, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4497991967871486, "calib/gap": 0.2729033510156554, "calib/mean_conf": 0.5066265060240964, "calib/mu_c": 0.6359541984732825, "calib/mu_w": 0.3630508474576271, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16132530120481922, "calib/std_conf": 0.4773528338369109, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31754936268829664, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.03076989944442271, "calib/step_q_w": 0.28677946324387393, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 487.29296875, "completions/mean_terminated_length": 489.2039489746094, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.1664, "grad_norm": 0.028783278539776802, "kl": 0.12811279296875, "learning_rate": 1.2222222222222223e-06, "loss": -0.0318, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03596906363964081, "mask/share_reasoning": 0.8175702095031738, "mask/share_step_conf": 0.14255444705486298, "num_tokens": 37088752.0, "reward": 0.6223392486572266, "reward_std": 0.2575696110725403, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6405097246170044, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.30729368329048157, "step": 156 }, { "adv/mean_abs_final_conf": 0.6383641958236694, "adv/mean_abs_reasoning": 0.5246529579162598, "adv/mean_abs_step_conf": 0.6283718943595886, "adv/ratio_final_to_reasoning": 1.2167361037266071, "adv/ratio_step_to_reasoning": 1.1976905588320033, "adv/std_final_conf": 0.8655281662940979, "adv/std_reasoning": 0.8098201751708984, "adv/std_step_conf": 0.8603395819664001, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7785322451164036, "calib/avg_num_step_conf": 6.86328125, "calib/ece": 0.2307630522088353, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4979919678714859, "calib/gap": 0.48737289269467476, "calib/mean_conf": 0.5485943775100403, "calib/mu_c": 0.7462837837837837, "calib/mu_w": 0.2589108910891089, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0924899598393574, "calib/std_conf": 0.47510053060931756, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36645461928934014, "calib/step_q_c_n": 985.0, "calib/step_q_gap": 0.05500099234633493, "calib/step_q_w": 0.3114536269430052, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 510.42578125, "completions/mean_terminated_length": 510.42578125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.16746666666666668, "grad_norm": 0.026828613132238388, "kl": 0.1251220703125, "learning_rate": 1.1944444444444446e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03643594682216644, "mask/share_reasoning": 0.8140197992324829, "mask/share_step_conf": 0.14954423904418945, "num_tokens": 37323149.0, "reward": 0.6376811861991882, "reward_std": 0.22104200720787048, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7451468706130981, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.22005924582481384, "step": 157 }, { "adv/mean_abs_final_conf": 0.6425611972808838, "adv/mean_abs_reasoning": 0.5250939726829529, "adv/mean_abs_step_conf": 0.6272925734519958, "adv/ratio_final_to_reasoning": 1.2237070518972737, "adv/ratio_step_to_reasoning": 1.1946291637035216, "adv/std_final_conf": 0.8290287256240845, "adv/std_reasoning": 0.7754476070404053, "adv/std_step_conf": 0.8440023064613342, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6657873376623378, "calib/avg_num_step_conf": 6.73828125, "calib/ece": 0.32276000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.504, "calib/gap": 0.2951569264069264, "calib/mean_conf": 0.5614, "calib/mu_c": 0.6747402597402598, "calib/mu_w": 0.37958333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13408, "calib/std_conf": 0.470522305528654, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3874596049197861, "calib/step_q_c_n": 935.0, "calib/step_q_gap": 0.043700237831178546, "calib/step_q_w": 0.34375936708860755, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 475.31640625, "completions/mean_terminated_length": 479.0590515136719, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.16853333333333334, "grad_norm": 0.03342423215508461, "kl": 0.144012451171875, "learning_rate": 1.1666666666666668e-06, "loss": 0.091, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037381358444690704, "mask/share_reasoning": 0.8066922426223755, "mask/share_step_conf": 0.14811387658119202, "num_tokens": 37550070.0, "reward": 0.62567138671875, "reward_std": 0.22000226378440857, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.662811279296875, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.27290648221969604, "step": 158 }, { "adv/mean_abs_final_conf": 0.6379877328872681, "adv/mean_abs_reasoning": 0.4578869342803955, "adv/mean_abs_step_conf": 0.6275926828384399, "adv/ratio_final_to_reasoning": 1.3933302855429033, "adv/ratio_step_to_reasoning": 1.3706280652553453, "adv/std_final_conf": 0.8404585719108582, "adv/std_reasoning": 0.720635712146759, "adv/std_step_conf": 0.8599308729171753, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.728595501976797, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.29396, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.436, "calib/gap": 0.36862207531272284, "calib/mean_conf": 0.5010800000000001, "calib/mu_c": 0.6647482014388489, "calib/mu_w": 0.2961261261261261, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11952, "calib/std_conf": 0.47024082510985793, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.37414948453608243, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.10081394952947742, "calib/step_q_w": 0.273335535006605, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 461.265625, "completions/mean_terminated_length": 461.265625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1696, "grad_norm": 0.03322465345263481, "kl": 0.1392364501953125, "learning_rate": 1.138888888888889e-06, "loss": 0.075, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.038932427763938904, "mask/share_reasoning": 0.8186612725257874, "mask/share_step_conf": 0.14240629971027374, "num_tokens": 37772938.0, "reward": 0.6667243242263794, "reward_std": 0.2159826159477234, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6943285465240479, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.335213840007782, "step": 159 }, { "adv/mean_abs_final_conf": 0.7036552429199219, "adv/mean_abs_reasoning": 0.5878018140792847, "adv/mean_abs_step_conf": 0.6873990297317505, "adv/ratio_final_to_reasoning": 1.1970960722911457, "adv/ratio_step_to_reasoning": 1.1694401297629065, "adv/std_final_conf": 0.8834091424942017, "adv/std_reasoning": 0.8100034594535828, "adv/std_step_conf": 0.8916254043579102, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7044484460450847, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.2904897959183674, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4122448979591837, "calib/gap": 0.3674369747899159, "calib/mean_conf": 0.45081632653061227, "calib/mu_c": 0.6292857142857142, "calib/mu_w": 0.26184873949579834, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1135102040816327, "calib/std_conf": 0.47564451111931716, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3385369127516778, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.05909707872678155, "calib/step_q_w": 0.27943983402489625, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 481.453125, "completions/mean_terminated_length": 491.0438537597656, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.17066666666666666, "grad_norm": 0.02368181012570858, "kl": 0.138092041015625, "learning_rate": 1.111111111111111e-06, "loss": -0.0589, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03467347100377083, "mask/share_reasoning": 0.812345027923584, "mask/share_step_conf": 0.1334502398967743, "num_tokens": 38001030.0, "reward": 0.5915138721466064, "reward_std": 0.2379169464111328, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.673277735710144, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.21990624070167542, "step": 160 }, { "adv/mean_abs_final_conf": 0.642875611782074, "adv/mean_abs_reasoning": 0.36083823442459106, "adv/mean_abs_step_conf": 0.6174168586730957, "adv/ratio_final_to_reasoning": 1.7816172191598056, "adv/ratio_step_to_reasoning": 1.7110627416123363, "adv/std_final_conf": 0.8336023688316345, "adv/std_reasoning": 0.6403910517692566, "adv/std_step_conf": 0.8441663384437561, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7458947206875384, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.31699604743083004, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.39920948616600793, "calib/gap": 0.4325943830570903, "calib/mean_conf": 0.46490118577075096, "calib/mu_c": 0.588011049723757, "calib/mu_w": 0.15541666666666668, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03324110671936758, "calib/std_conf": 0.4706791096148659, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36651016456921587, "calib/step_q_c_n": 1033.0, "calib/step_q_gap": 0.10200585422438829, "calib/step_q_w": 0.2645043103448276, "calib/step_q_w_n": 464.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 449.60546875, "completions/mean_terminated_length": 449.60546875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.17173333333333332, "grad_norm": 0.0701291561126709, "kl": 0.1392822265625, "learning_rate": 1.0833333333333335e-06, "loss": 0.097, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03925294801592827, "mask/share_reasoning": 0.8194358944892883, "mask/share_step_conf": 0.1413111388683319, "num_tokens": 38220049.0, "reward": 0.6575615406036377, "reward_std": 0.2015339732170105, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.6762851476669312, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.30055665969848633, "step": 161 }, { "adv/mean_abs_final_conf": 0.605049729347229, "adv/mean_abs_reasoning": 0.5026243925094604, "adv/mean_abs_step_conf": 0.7116568088531494, "adv/ratio_final_to_reasoning": 1.2037810706447971, "adv/ratio_step_to_reasoning": 1.4158819576981723, "adv/std_final_conf": 0.812254786491394, "adv/std_reasoning": 0.7575938105583191, "adv/std_step_conf": 0.9068528413772583, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7510630758327428, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.27243027888446214, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.43729907866761164, "calib/mean_conf": 0.5446215139442231, "calib/mu_c": 0.692710843373494, "calib/mu_w": 0.25541176470588234, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07784860557768924, "calib/std_conf": 0.47993541665380784, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35390433691756273, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.06313488753923235, "calib/step_q_w": 0.2907694493783304, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 426.2578125, "completions/mean_terminated_length": 426.2578125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1728, "grad_norm": 0.03929247707128525, "kl": 0.1557769775390625, "learning_rate": 1.0555555555555557e-06, "loss": 0.0267, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.039860356599092484, "mask/share_reasoning": 0.8166975975036621, "mask/share_step_conf": 0.1434420347213745, "num_tokens": 38433315.0, "reward": 0.6589879989624023, "reward_std": 0.21501243114471436, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7137320041656494, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.278462678194046, "step": 162 }, { "adv/mean_abs_final_conf": 0.6260947585105896, "adv/mean_abs_reasoning": 0.4744158685207367, "adv/mean_abs_step_conf": 0.5998172163963318, "adv/ratio_final_to_reasoning": 1.3197171512470667, "adv/ratio_step_to_reasoning": 1.2643278949892753, "adv/std_final_conf": 0.8409026861190796, "adv/std_reasoning": 0.7394002079963684, "adv/std_step_conf": 0.811038076877594, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7668760330578512, "calib/avg_num_step_conf": 7.13671875, "calib/ece": 0.21711382113821134, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3902439024390244, "calib/gap": 0.46563239669421497, "calib/mean_conf": 0.47808943089430894, "calib/mu_c": 0.7071200000000001, "calib/mu_w": 0.24148760330578511, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09353658536585362, "calib/std_conf": 0.4620438952175499, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.36207522697795075, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.11590935020069953, "calib/step_q_w": 0.2461658767772512, "calib/step_q_w_n": 1055.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 517.96875, "completions/mean_terminated_length": 524.1107177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 57.0, "epoch": 0.17386666666666667, "grad_norm": 0.038553379476070404, "kl": 0.12386322021484375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0309, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036159660667181015, "mask/share_reasoning": 0.8097751140594482, "mask/share_step_conf": 0.14234648644924164, "num_tokens": 38670747.0, "reward": 0.6114170551300049, "reward_std": 0.20468053221702576, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7306042909622192, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.20394855737686157, "step": 163 }, { "adv/mean_abs_final_conf": 0.617720365524292, "adv/mean_abs_reasoning": 0.522842288017273, "adv/mean_abs_step_conf": 0.5830103158950806, "adv/ratio_final_to_reasoning": 1.181465959585665, "adv/ratio_step_to_reasoning": 1.1150787326441733, "adv/std_final_conf": 0.8278981447219849, "adv/std_reasoning": 0.7927848100662231, "adv/std_step_conf": 0.8276817202568054, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7556065573770491, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.24781376518218623, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3562753036437247, "calib/gap": 0.4245731147540983, "calib/mean_conf": 0.4276518218623482, "calib/mu_c": 0.6373599999999999, "calib/mu_w": 0.21278688524590164, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.08469635627530366, "calib/std_conf": 0.4626828911029161, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.33996546329723226, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.052494465617417885, "calib/step_q_w": 0.2874709976798144, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 546.37890625, "completions/mean_terminated_length": 548.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.17493333333333333, "grad_norm": 0.030100451782345772, "kl": 0.1212615966796875, "learning_rate": 1.0000000000000002e-06, "loss": -0.0006, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03141268342733383, "mask/share_reasoning": 0.8353418111801147, "mask/share_step_conf": 0.12933926284313202, "num_tokens": 38916756.0, "reward": 0.6199631690979004, "reward_std": 0.2199994921684265, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7159785032272339, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.23254159092903137, "step": 164 }, { "adv/mean_abs_final_conf": 0.6197268962860107, "adv/mean_abs_reasoning": 0.4494856894016266, "adv/mean_abs_step_conf": 0.5888758897781372, "adv/ratio_final_to_reasoning": 1.3787466673544515, "adv/ratio_step_to_reasoning": 1.3101104299050599, "adv/std_final_conf": 0.842979371547699, "adv/std_reasoning": 0.7206093072891235, "adv/std_step_conf": 0.8442461490631104, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7759187620889749, "calib/avg_num_step_conf": 6.72265625, "calib/ece": 0.23342629482071703, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.44621513944223107, "calib/gap": 0.48048742746615103, "calib/mean_conf": 0.49972111553784865, "calib/mu_c": 0.7696363636363638, "calib/mu_w": 0.28914893617021276, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1474501992031872, "calib/std_conf": 0.4777323613447663, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.35916215429403203, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.08272056822053109, "calib/step_q_w": 0.27644158607350094, "calib/step_q_w_n": 1034.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 519.453125, "completions/mean_terminated_length": 519.453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.176, "grad_norm": 0.0253030676394701, "kl": 0.1275787353515625, "learning_rate": 9.722222222222224e-07, "loss": -0.0169, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034441519528627396, "mask/share_reasoning": 0.8312867879867554, "mask/share_step_conf": 0.13427163660526276, "num_tokens": 39155312.0, "reward": 0.6153743863105774, "reward_std": 0.22514969110488892, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7435730695724487, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.20514453947544098, "step": 165 }, { "adv/mean_abs_final_conf": 0.5640722513198853, "adv/mean_abs_reasoning": 0.4528849720954895, "adv/mean_abs_step_conf": 0.6615427732467651, "adv/ratio_final_to_reasoning": 1.2455088732795316, "adv/ratio_step_to_reasoning": 1.4607302383777945, "adv/std_final_conf": 0.81165611743927, "adv/std_reasoning": 0.7392885088920593, "adv/std_step_conf": 0.876011312007904, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8413442227875217, "calib/avg_num_step_conf": 7.0703125, "calib/ece": 0.16521912350597606, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5338645418326693, "calib/gap": 0.6092703173115543, "calib/mean_conf": 0.5935059760956175, "calib/mu_c": 0.8289610389610389, "calib/mu_w": 0.21969072164948453, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07258964143426293, "calib/std_conf": 0.4716247864641066, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.35446376811594205, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.07381731650303885, "calib/step_q_w": 0.2806464516129032, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 524.66796875, "completions/mean_terminated_length": 526.7255249023438, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.17706666666666668, "grad_norm": 0.029192175716161728, "kl": 0.119293212890625, "learning_rate": 9.444444444444445e-07, "loss": -0.0501, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03343809396028519, "mask/share_reasoning": 0.8208543062210083, "mask/share_step_conf": 0.14180134236812592, "num_tokens": 39395811.0, "reward": 0.735075831413269, "reward_std": 0.23994724452495575, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.8127949237823486, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.340950608253479, "step": 166 }, { "adv/mean_abs_final_conf": 0.580825924873352, "adv/mean_abs_reasoning": 0.4305885434150696, "adv/mean_abs_step_conf": 0.6823903918266296, "adv/ratio_final_to_reasoning": 1.3489117017994132, "adv/ratio_step_to_reasoning": 1.584785295062608, "adv/std_final_conf": 0.8161166906356812, "adv/std_reasoning": 0.7015215158462524, "adv/std_step_conf": 0.875901997089386, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6656862745098039, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.23956175298804777, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6733067729083665, "calib/gap": 0.3225141612200434, "calib/mean_conf": 0.7425099601593625, "calib/mu_c": 0.8465882352941175, "calib/mu_w": 0.5240740740740741, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15239043824701193, "calib/std_conf": 0.40117714602209914, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.37194528650646946, "calib/step_q_c_n": 1082.0, "calib/step_q_gap": 0.018180104320234647, "calib/step_q_w": 0.3537651821862348, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 469.33984375, "completions/mean_terminated_length": 474.9051513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.17813333333333334, "grad_norm": 0.028904518112540245, "kl": 0.1312255859375, "learning_rate": 9.166666666666666e-07, "loss": -0.0316, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036311276257038116, "mask/share_reasoning": 0.820502519607544, "mask/share_step_conf": 0.13146746158599854, "num_tokens": 39621570.0, "reward": 0.6593512892723083, "reward_std": 0.21816681325435638, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7424285411834717, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2473677694797516, "step": 167 }, { "adv/mean_abs_final_conf": 0.640915036201477, "adv/mean_abs_reasoning": 0.5599716901779175, "adv/mean_abs_step_conf": 0.6014311909675598, "adv/ratio_final_to_reasoning": 1.1445489967498925, "adv/ratio_step_to_reasoning": 1.0740385657290452, "adv/std_final_conf": 0.8428418040275574, "adv/std_reasoning": 0.7929864525794983, "adv/std_step_conf": 0.8278246521949768, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7442237249929557, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.2296356275303644, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5789473684210527, "calib/gap": 0.39014652014652007, "calib/mean_conf": 0.6580566801619433, "calib/mu_c": 0.8017948717948717, "calib/mu_w": 0.4116483516483517, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12805668016194335, "calib/std_conf": 0.4367855865401485, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.38553215859030837, "calib/step_q_c_n": 908.0, "calib/step_q_gap": 0.08596444558582406, "calib/step_q_w": 0.2995677130044843, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 503.46875, "completions/mean_terminated_length": 505.44317626953125, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.1792, "grad_norm": 0.026781795546412468, "kl": 0.1285858154296875, "learning_rate": 8.88888888888889e-07, "loss": -0.0292, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03355613350868225, "mask/share_reasoning": 0.8358818292617798, "mask/share_step_conf": 0.12665575742721558, "num_tokens": 39855130.0, "reward": 0.6448159217834473, "reward_std": 0.2607659101486206, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7229546904563904, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.25261467695236206, "step": 168 }, { "adv/mean_abs_final_conf": 0.6007251739501953, "adv/mean_abs_reasoning": 0.4678575396537781, "adv/mean_abs_step_conf": 0.5844001770019531, "adv/ratio_final_to_reasoning": 1.2839916492416503, "adv/ratio_step_to_reasoning": 1.2490985555868532, "adv/std_final_conf": 0.8269153237342834, "adv/std_reasoning": 0.7392950057983398, "adv/std_step_conf": 0.794001579284668, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7719065656565657, "calib/avg_num_step_conf": 5.87109375, "calib/ece": 0.24932539682539692, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5753968253968254, "calib/gap": 0.4322878787878788, "calib/mean_conf": 0.6437698412698413, "calib/mu_c": 0.8496212121212121, "calib/mu_w": 0.41733333333333333, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18464285714285722, "calib/std_conf": 0.448004033122053, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42760922155688624, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.11284874251497007, "calib/step_q_w": 0.3147604790419162, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 481.96484375, "completions/mean_terminated_length": 485.75982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.18026666666666666, "grad_norm": 0.022836333140730858, "kl": 0.131591796875, "learning_rate": 8.611111111111112e-07, "loss": -0.0533, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03490082174539566, "mask/share_reasoning": 0.8294519782066345, "mask/share_step_conf": 0.1278347223997116, "num_tokens": 40082697.0, "reward": 0.6543014645576477, "reward_std": 0.24539294838905334, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7355679869651794, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.273034930229187, "step": 169 }, { "adv/mean_abs_final_conf": 0.6357203722000122, "adv/mean_abs_reasoning": 0.48368558287620544, "adv/mean_abs_step_conf": 0.6828759908676147, "adv/ratio_final_to_reasoning": 1.314325658457177, "adv/ratio_step_to_reasoning": 1.411817955802892, "adv/std_final_conf": 0.8231263756752014, "adv/std_reasoning": 0.7206954956054688, "adv/std_step_conf": 0.8760550022125244, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8362116228070176, "calib/avg_num_step_conf": 6.8671875, "calib/ece": 0.19250000000000003, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6088709677419355, "calib/gap": 0.4952083333333333, "calib/mean_conf": 0.6858064516129032, "calib/mu_c": 0.8775, "calib/mu_w": 0.38229166666666664, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.13270161290322585, "calib/std_conf": 0.4299129633860893, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.39136444652908065, "calib/step_q_c_n": 1066.0, "calib/step_q_gap": 0.07494667196260663, "calib/step_q_w": 0.316417774566474, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 524.5078125, "completions/mean_terminated_length": 524.5078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.18133333333333335, "grad_norm": 0.021576346829533577, "kl": 0.121429443359375, "learning_rate": 8.333333333333333e-07, "loss": 0.0017, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031903624534606934, "mask/share_reasoning": 0.8302265405654907, "mask/share_step_conf": 0.13786983489990234, "num_tokens": 40321123.0, "reward": 0.7102303504943848, "reward_std": 0.27406108379364014, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7745558023452759, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.3349672555923462, "step": 170 }, { "adv/mean_abs_final_conf": 0.6799715757369995, "adv/mean_abs_reasoning": 0.5418053865432739, "adv/mean_abs_step_conf": 0.6000851988792419, "adv/ratio_final_to_reasoning": 1.255010733789909, "adv/ratio_step_to_reasoning": 1.107565952246791, "adv/std_final_conf": 0.9059799909591675, "adv/std_reasoning": 0.7928255796432495, "adv/std_step_conf": 0.8278549909591675, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6815433070866141, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.30476190476190484, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5317460317460317, "calib/gap": 0.31031055118110235, "calib/mean_conf": 0.6216666666666667, "calib/mu_c": 0.7755905511811023, "calib/mu_w": 0.46527999999999997, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21123015873015882, "calib/std_conf": 0.4468030175680739, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3923847167325428, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.056257109370579705, "calib/step_q_w": 0.3361276073619631, "calib/step_q_w_n": 815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 476.28515625, "completions/mean_terminated_length": 480.0354309082031, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.1824, "grad_norm": 0.027811603620648384, "kl": 0.11895751953125, "learning_rate": 8.055555555555557e-07, "loss": 0.0278, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035914987325668335, "mask/share_reasoning": 0.823911190032959, "mask/share_step_conf": 0.1323612779378891, "num_tokens": 40549948.0, "reward": 0.6012252569198608, "reward_std": 0.2405487596988678, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6732117533683777, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.23470759391784668, "step": 171 }, { "adv/mean_abs_final_conf": 0.629769504070282, "adv/mean_abs_reasoning": 0.4914962351322174, "adv/mean_abs_step_conf": 0.6660798788070679, "adv/ratio_final_to_reasoning": 1.2813312881244505, "adv/ratio_step_to_reasoning": 1.355208506587819, "adv/std_final_conf": 0.8284575939178467, "adv/std_reasoning": 0.7393813133239746, "adv/std_step_conf": 0.8759723901748657, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6436507936507937, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.30389558232931724, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6867469879518072, "calib/gap": 0.1468549783549783, "calib/mean_conf": 0.7676706827309236, "calib/mu_c": 0.8172121212121213, "calib/mu_w": 0.670357142857143, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20445783132530124, "calib/std_conf": 0.3773030963163373, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3772694296141271, "calib/step_q_c_n": 991.0, "calib/step_q_gap": 0.04662322606000757, "calib/step_q_w": 0.33064620355411956, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 465.9765625, "completions/mean_terminated_length": 469.6456604003906, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.18346666666666667, "grad_norm": 0.02604655548930168, "kl": 0.1365203857421875, "learning_rate": 7.777777777777779e-07, "loss": 0.0037, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03690432012081146, "mask/share_reasoning": 0.8133035898208618, "mask/share_step_conf": 0.14197959005832672, "num_tokens": 40772590.0, "reward": 0.6894025802612305, "reward_std": 0.2624654471874237, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6662933826446533, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.3898555040359497, "step": 172 }, { "adv/mean_abs_final_conf": 0.6197570562362671, "adv/mean_abs_reasoning": 0.5932114720344543, "adv/mean_abs_step_conf": 0.6575195789337158, "adv/ratio_final_to_reasoning": 1.0447489393803748, "adv/ratio_step_to_reasoning": 1.1084067148578785, "adv/std_final_conf": 0.8254387974739075, "adv/std_reasoning": 0.8099361658096313, "adv/std_step_conf": 0.8760329484939575, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7400977366255145, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.25285714285714284, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7261904761904762, "calib/gap": 0.36671296296296285, "calib/mean_conf": 0.7804761904761905, "calib/mu_c": 0.9376388888888888, "calib/mu_w": 0.570925925925926, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2309523809523809, "calib/std_conf": 0.38551317508389427, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.36691914022517913, "calib/step_q_c_n": 977.0, "calib/step_q_gap": -0.0015090206943610962, "calib/step_q_w": 0.3684281609195402, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 521.0234375, "completions/mean_terminated_length": 523.0667114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.18453333333333333, "grad_norm": 0.02251630648970604, "kl": 0.126922607421875, "learning_rate": 7.5e-07, "loss": 0.0376, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03766322880983353, "mask/share_reasoning": 0.8203516006469727, "mask/share_step_conf": 0.13807891309261322, "num_tokens": 41009132.0, "reward": 0.6263689398765564, "reward_std": 0.283684641122818, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7153265476226807, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2311612367630005, "step": 173 }, { "adv/mean_abs_final_conf": 0.7387194633483887, "adv/mean_abs_reasoning": 0.6263118982315063, "adv/mean_abs_step_conf": 0.5999409556388855, "adv/ratio_final_to_reasoning": 1.1794753786959555, "adv/ratio_step_to_reasoning": 0.9578948720803748, "adv/std_final_conf": 0.8921695351600647, "adv/std_reasoning": 0.8266700506210327, "adv/std_step_conf": 0.8278965353965759, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6318305084745762, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.3406559670781892, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5679012345679012, "calib/gap": 0.24506677966101698, "calib/mean_conf": 0.6300436213991769, "calib/mu_c": 0.7561067796610169, "calib/mu_w": 0.5110399999999999, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.242551440329218, "calib/std_conf": 0.45110726096520265, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3871065217391304, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.05627544202167328, "calib/step_q_w": 0.3308310797174571, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 567.32421875, "completions/mean_terminated_length": 567.32421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1856, "grad_norm": 0.050442811101675034, "kl": 0.1150360107421875, "learning_rate": 7.222222222222222e-07, "loss": 0.0306, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03203814476728439, "mask/share_reasoning": 0.8387712240219116, "mask/share_step_conf": 0.12919063866138458, "num_tokens": 41258599.0, "reward": 0.5362797975540161, "reward_std": 0.29510360956192017, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6116565465927124, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.17965298891067505, "step": 174 }, { "adv/mean_abs_final_conf": 0.7071558237075806, "adv/mean_abs_reasoning": 0.5694407224655151, "adv/mean_abs_step_conf": 0.592195987701416, "adv/ratio_final_to_reasoning": 1.2418427341230507, "adv/ratio_step_to_reasoning": 1.0399607269697486, "adv/std_final_conf": 0.8904350399971008, "adv/std_reasoning": 0.775518000125885, "adv/std_step_conf": 0.8421236872673035, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.741650632215503, "calib/avg_num_step_conf": 6.86328125, "calib/ece": 0.28263374485596704, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5061728395061729, "calib/gap": 0.3985417811984606, "calib/mean_conf": 0.6176954732510288, "calib/mu_c": 0.8407476635514018, "calib/mu_w": 0.4422058823529412, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22999999999999998, "calib/std_conf": 0.4500923977615245, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39407936507936503, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.07855762594893029, "calib/step_q_w": 0.31552173913043474, "calib/step_q_w_n": 1127.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 529.8125, "completions/mean_terminated_length": 540.3665771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.18666666666666668, "grad_norm": 0.03564886003732681, "kl": 0.123687744140625, "learning_rate": 6.944444444444446e-07, "loss": -0.0416, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.033526644110679626, "mask/share_reasoning": 0.8139470815658569, "mask/share_step_conf": 0.13299497961997986, "num_tokens": 41500055.0, "reward": 0.5531408190727234, "reward_std": 0.2512364983558655, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6795945167541504, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.15324954688549042, "step": 175 }, { "adv/mean_abs_final_conf": 0.6224998831748962, "adv/mean_abs_reasoning": 0.49394357204437256, "adv/mean_abs_step_conf": 0.7324033975601196, "adv/ratio_final_to_reasoning": 1.2602651768469113, "adv/ratio_step_to_reasoning": 1.4827673422872794, "adv/std_final_conf": 0.822606086730957, "adv/std_reasoning": 0.7394349575042725, "adv/std_step_conf": 0.9069081544876099, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7572134387351779, "calib/avg_num_step_conf": 6.53125, "calib/ece": 0.24838056680161957, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6032388663967612, "calib/gap": 0.4230645586297761, "calib/mean_conf": 0.6781781376518218, "calib/mu_c": 0.8751515151515152, "calib/mu_w": 0.45208695652173914, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19607287449392724, "calib/std_conf": 0.43096107718488164, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4410749004227213, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.13907107528610924, "calib/step_q_w": 0.30200382513661206, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 517.1796875, "completions/mean_terminated_length": 521.251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.18773333333333334, "grad_norm": 0.03228491172194481, "kl": 0.1517181396484375, "learning_rate": 6.666666666666667e-07, "loss": -0.0619, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03568899258971214, "mask/share_reasoning": 0.8204825520515442, "mask/share_step_conf": 0.13601595163345337, "num_tokens": 41736517.0, "reward": 0.637576699256897, "reward_std": 0.27080726623535156, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7287644147872925, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.25029516220092773, "step": 176 }, { "adv/mean_abs_final_conf": 0.6451828479766846, "adv/mean_abs_reasoning": 0.5176087617874146, "adv/mean_abs_step_conf": 0.6049473881721497, "adv/ratio_final_to_reasoning": 1.2464681736621483, "adv/ratio_step_to_reasoning": 1.1687348299189064, "adv/std_final_conf": 0.8734248280525208, "adv/std_reasoning": 0.7754340767860413, "adv/std_step_conf": 0.8263915181159973, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7547961630695443, "calib/avg_num_step_conf": 6.25, "calib/ece": 0.23935222672064776, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5506072874493927, "calib/gap": 0.3819984012789766, "calib/mean_conf": 0.6505263157894737, "calib/mu_c": 0.8175539568345322, "calib/mu_w": 0.43555555555555564, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16356275303643725, "calib/std_conf": 0.4316742266703525, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43631858958068614, "calib/step_q_c_n": 787.0, "calib/step_q_gap": 0.11396926608745123, "calib/step_q_w": 0.3223493234932349, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 510.12890625, "completions/mean_terminated_length": 510.12890625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1888, "grad_norm": 0.03363857790827751, "kl": 0.116729736328125, "learning_rate": 6.388888888888889e-07, "loss": 0.0482, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03490570932626724, "mask/share_reasoning": 0.8320082426071167, "mask/share_step_conf": 0.13308599591255188, "num_tokens": 41970942.0, "reward": 0.6575802564620972, "reward_std": 0.2575252056121826, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7176925539970398, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2959054410457611, "step": 177 }, { "adv/mean_abs_final_conf": 0.6610404253005981, "adv/mean_abs_reasoning": 0.554440975189209, "adv/mean_abs_step_conf": 0.6860345602035522, "adv/ratio_final_to_reasoning": 1.1922647403089408, "adv/ratio_step_to_reasoning": 1.2373446244109854, "adv/std_final_conf": 0.8628710508346558, "adv/std_reasoning": 0.7928512692451477, "adv/std_step_conf": 0.9068071246147156, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8206493506493506, "calib/avg_num_step_conf": 6.1015625, "calib/ece": 0.1695628, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.544, "calib/gap": 0.49752584415584417, "calib/mean_conf": 0.6565172, "calib/mu_c": 0.8754285714285714, "calib/mu_w": 0.3779027272727273, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13304000000000002, "calib/std_conf": 0.4231773340623999, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3906520215633423, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.07616665570968367, "calib/step_q_w": 0.3144853658536586, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 470.98046875, "completions/mean_terminated_length": 472.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.18986666666666666, "grad_norm": 0.03201735392212868, "kl": 0.12847900390625, "learning_rate": 6.111111111111112e-07, "loss": -0.041, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03747853636741638, "mask/share_reasoning": 0.8215433359146118, "mask/share_step_conf": 0.13707192242145538, "num_tokens": 42197585.0, "reward": 0.6837292909622192, "reward_std": 0.269634872674942, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7913926839828491, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2713782787322998, "step": 178 }, { "adv/mean_abs_final_conf": 0.6383603811264038, "adv/mean_abs_reasoning": 0.5476167798042297, "adv/mean_abs_step_conf": 0.6959847807884216, "adv/ratio_final_to_reasoning": 1.1657063929900293, "adv/ratio_step_to_reasoning": 1.2709339933616217, "adv/std_final_conf": 0.856768012046814, "adv/std_reasoning": 0.7755259871482849, "adv/std_step_conf": 0.8761194348335266, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.771365787694388, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.18987854251012146, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5587044534412956, "calib/gap": 0.4586267748478702, "calib/mean_conf": 0.6524696356275304, "calib/mu_c": 0.8418620689655172, "calib/mu_w": 0.383235294117647, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12765182186234816, "calib/std_conf": 0.4330748433333519, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43302367531003383, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.15888579892334603, "calib/step_q_w": 0.2741378763866878, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 513.8515625, "completions/mean_terminated_length": 515.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.19093333333333334, "grad_norm": 0.029710792005062103, "kl": 0.1192779541015625, "learning_rate": 5.833333333333334e-07, "loss": -0.0479, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03475484997034073, "mask/share_reasoning": 0.8319066762924194, "mask/share_step_conf": 0.12943220138549805, "num_tokens": 42435395.0, "reward": 0.6647284030914307, "reward_std": 0.2787940502166748, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7569183111190796, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2662884593009949, "step": 179 }, { "adv/mean_abs_final_conf": 0.6357666254043579, "adv/mean_abs_reasoning": 0.4256601333618164, "adv/mean_abs_step_conf": 0.6477457284927368, "adv/ratio_final_to_reasoning": 1.49360152754533, "adv/ratio_step_to_reasoning": 1.5217439401169968, "adv/std_final_conf": 0.8565616607666016, "adv/std_reasoning": 0.7205907106399536, "adv/std_step_conf": 0.8913508653640747, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7624752475247525, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.22825498007968129, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5856573705179283, "calib/gap": 0.35569293729372947, "calib/mean_conf": 0.6965258964143425, "calib/mu_c": 0.8396533333333334, "calib/mu_w": 0.4839603960396039, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16358565737051795, "calib/std_conf": 0.40609752219946754, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.36742843232716654, "calib/step_q_c_n": 1027.0, "calib/step_q_gap": 0.05588553724084522, "calib/step_q_w": 0.3115428950863213, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 573.54296875, "completions/mean_terminated_length": 575.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.192, "grad_norm": 0.02507541887462139, "kl": 0.116607666015625, "learning_rate": 5.555555555555555e-07, "loss": -0.0122, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0303267240524292, "mask/share_reasoning": 0.8349812030792236, "mask/share_step_conf": 0.13078580796718597, "num_tokens": 42686078.0, "reward": 0.6616320610046387, "reward_std": 0.24009902775287628, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.729920506477356, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.28240612149238586, "step": 180 }, { "adv/mean_abs_final_conf": 0.6857171058654785, "adv/mean_abs_reasoning": 0.47207292914390564, "adv/mean_abs_step_conf": 0.5858733654022217, "adv/ratio_final_to_reasoning": 1.4525660412449664, "adv/ratio_step_to_reasoning": 1.2410653719642233, "adv/std_final_conf": 0.8858541250228882, "adv/std_reasoning": 0.7575730085372925, "adv/std_step_conf": 0.8279093503952026, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7807853602744947, "calib/avg_num_step_conf": 5.87109375, "calib/ece": 0.21561752988047814, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.40606493836573904, "calib/mean_conf": 0.6454980079681274, "calib/mu_c": 0.8428682170542636, "calib/mu_w": 0.4368032786885246, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17358565737051795, "calib/std_conf": 0.4147202424262693, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43673177083333337, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.09254524022108845, "calib/step_q_w": 0.3441865306122449, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 467.3671875, "completions/mean_terminated_length": 469.2000427246094, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.19306666666666666, "grad_norm": 0.048911020159721375, "kl": 0.134918212890625, "learning_rate": 5.277777777777779e-07, "loss": 0.0121, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036969106644392014, "mask/share_reasoning": 0.8241372108459473, "mask/share_step_conf": 0.13498741388320923, "num_tokens": 42911988.0, "reward": 0.6410413980484009, "reward_std": 0.27544742822647095, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7488523721694946, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2363554835319519, "step": 181 }, { "adv/mean_abs_final_conf": 0.6549832820892334, "adv/mean_abs_reasoning": 0.3934985399246216, "adv/mean_abs_step_conf": 0.6373138427734375, "adv/ratio_final_to_reasoning": 1.664512610935486, "adv/ratio_step_to_reasoning": 1.6196091677888336, "adv/std_final_conf": 0.864326536655426, "adv/std_reasoning": 0.6815158724784851, "adv/std_step_conf": 0.8440147638320923, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.775177993527508, "calib/avg_num_step_conf": 6.4921875, "calib/ece": 0.21102766798418982, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6561264822134387, "calib/gap": 0.37938576051779926, "calib/mean_conf": 0.7466798418972332, "calib/mu_c": 0.9011333333333332, "calib/mu_w": 0.521747572815534, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18241106719367597, "calib/std_conf": 0.38397242915676144, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3922387527839644, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.03183652765307432, "calib/step_q_w": 0.3604022251308901, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 499.80859375, "completions/mean_terminated_length": 501.7686462402344, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.19413333333333332, "grad_norm": 0.030112335458397865, "kl": 0.1181640625, "learning_rate": 5.000000000000001e-07, "loss": 0.0083, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03436057269573212, "mask/share_reasoning": 0.8254109025001526, "mask/share_step_conf": 0.1363222450017929, "num_tokens": 43146099.0, "reward": 0.7041236162185669, "reward_std": 0.22663047909736633, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7616550922393799, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.3317483961582184, "step": 182 }, { "adv/mean_abs_final_conf": 0.641716480255127, "adv/mean_abs_reasoning": 0.5711173415184021, "adv/mean_abs_step_conf": 0.7015622854232788, "adv/ratio_final_to_reasoning": 1.1236158204354754, "adv/ratio_step_to_reasoning": 1.2284030520909575, "adv/std_final_conf": 0.8584265112876892, "adv/std_reasoning": 0.8100204467773438, "adv/std_step_conf": 0.9068267345428467, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7253411306042885, "calib/avg_num_step_conf": 6.32421875, "calib/ece": 0.3023293172690763, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4859437751004016, "calib/gap": 0.30676608187134496, "calib/mean_conf": 0.5837751004016065, "calib/mu_c": 0.7242222222222222, "calib/mu_w": 0.41745614035087725, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17196787148594378, "calib/std_conf": 0.44907334937062315, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3710453474676089, "calib/step_q_c_n": 849.0, "calib/step_q_gap": 0.037227165649556915, "calib/step_q_w": 0.333818181818052, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 555.88671875, "completions/mean_terminated_length": 558.0667114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.1952, "grad_norm": 0.022334614768624306, "kl": 0.10772705078125, "learning_rate": 4.7222222222222226e-07, "loss": -0.0356, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03180518373847008, "mask/share_reasoning": 0.8399969339370728, "mask/share_step_conf": 0.12429161369800568, "num_tokens": 43395086.0, "reward": 0.6339738368988037, "reward_std": 0.25076961517333984, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6815133094787598, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2864344120025635, "step": 183 }, { "adv/mean_abs_final_conf": 0.648309588432312, "adv/mean_abs_reasoning": 0.5326626300811768, "adv/mean_abs_step_conf": 0.7190077900886536, "adv/ratio_final_to_reasoning": 1.2171110789835413, "adv/ratio_step_to_reasoning": 1.3498371191894543, "adv/std_final_conf": 0.8514952063560486, "adv/std_reasoning": 0.7928009629249573, "adv/std_step_conf": 0.9059350490570068, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7483679331505418, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.22909959839357422, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5943775100401606, "calib/gap": 0.38513407755581675, "calib/mean_conf": 0.6837180722891566, "calib/mu_c": 0.855404347826087, "calib/mu_w": 0.47027027027027024, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1793004016064256, "calib/std_conf": 0.4166708728237885, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3973337868480726, "calib/step_q_c_n": 882.0, "calib/step_q_gap": 0.03342225295984419, "calib/step_q_w": 0.3639115338882284, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 521.18359375, "completions/mean_terminated_length": 523.2274780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.19626666666666667, "grad_norm": 0.030016383156180382, "kl": 0.1154022216796875, "learning_rate": 4.444444444444445e-07, "loss": 0.0447, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033601127564907074, "mask/share_reasoning": 0.8270280361175537, "mask/share_step_conf": 0.1354646384716034, "num_tokens": 43633789.0, "reward": 0.6431786417961121, "reward_std": 0.2560381293296814, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7322711944580078, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.25174224376678467, "step": 184 }, { "adv/mean_abs_final_conf": 0.6456822156906128, "adv/mean_abs_reasoning": 0.5535425543785095, "adv/mean_abs_step_conf": 0.6214487552642822, "adv/ratio_final_to_reasoning": 1.1664545220295721, "adv/ratio_step_to_reasoning": 1.1226756648583494, "adv/std_final_conf": 0.8726965188980103, "adv/std_reasoning": 0.8098781704902649, "adv/std_step_conf": 0.8442644476890564, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8128409710856519, "calib/avg_num_step_conf": 7.09375, "calib/ece": 0.19886938775510207, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5265306122448979, "calib/gap": 0.46831307965084557, "calib/mean_conf": 0.6460285714285714, "calib/mu_c": 0.8448226950354609, "calib/mu_w": 0.37650961538461536, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13469387755102044, "calib/std_conf": 0.4324199810369544, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.384595603156708, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.1529379067089147, "calib/step_q_w": 0.2316576964477933, "calib/step_q_w_n": 929.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2850.0, "completions/max_terminated_length": 2850.0, "completions/mean_length": 491.75, "completions/mean_terminated_length": 501.54583740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.19733333333333333, "grad_norm": 0.027769120410084724, "kl": 0.12346649169921875, "learning_rate": 4.1666666666666667e-07, "loss": -0.0522, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.033237576484680176, "mask/share_reasoning": 0.8149456977844238, "mask/share_step_conf": 0.1322854906320572, "num_tokens": 43866597.0, "reward": 0.6342300176620483, "reward_std": 0.2558080554008484, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7545965313911438, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.21308214962482452, "step": 185 }, { "adv/mean_abs_final_conf": 0.6395155191421509, "adv/mean_abs_reasoning": 0.47967851161956787, "adv/mean_abs_step_conf": 0.7098360061645508, "adv/ratio_final_to_reasoning": 1.3332169435376948, "adv/ratio_step_to_reasoning": 1.4798161455427472, "adv/std_final_conf": 0.8394824862480164, "adv/std_reasoning": 0.7207673192024231, "adv/std_step_conf": 0.9067683815956116, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7708060288335516, "calib/avg_num_step_conf": 6.9296875, "calib/ece": 0.22248995983935738, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5582329317269076, "calib/gap": 0.4323394495412844, "calib/mean_conf": 0.6357429718875502, "calib/mu_c": 0.825, "calib/mu_w": 0.39266055045871556, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1479919678714859, "calib/std_conf": 0.4449046084762179, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3878627450980392, "calib/step_q_c_n": 867.0, "calib/step_q_gap": 0.08778534708260366, "calib/step_q_w": 0.30007739801543554, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 520.9296875, "completions/mean_terminated_length": 525.031494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.1984, "grad_norm": 0.030086809769272804, "kl": 0.116058349609375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0296, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034195803105831146, "mask/share_reasoning": 0.8220609426498413, "mask/share_step_conf": 0.13593077659606934, "num_tokens": 44104995.0, "reward": 0.6409979462623596, "reward_std": 0.2668594717979431, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.742479681968689, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2348286360502243, "step": 186 }, { "adv/mean_abs_final_conf": 0.6927512288093567, "adv/mean_abs_reasoning": 0.533842921257019, "adv/mean_abs_step_conf": 0.6895692348480225, "adv/ratio_final_to_reasoning": 1.2976686609951902, "adv/ratio_step_to_reasoning": 1.2917081174820502, "adv/std_final_conf": 0.884300708770752, "adv/std_reasoning": 0.7754393219947815, "adv/std_step_conf": 0.891502320766449, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7408947223749314, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.2505349794238684, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.588477366255144, "calib/gap": 0.3673096481583288, "calib/mean_conf": 0.6781893004115227, "calib/mu_c": 0.8399264705882353, "calib/mu_w": 0.47261682242990655, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18452674897119348, "calib/std_conf": 0.4225690743553865, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3916330858960763, "calib/step_q_c_n": 943.0, "calib/step_q_gap": 0.08050805881048584, "calib/step_q_w": 0.3111250270855905, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 552.69921875, "completions/mean_terminated_length": 559.2529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.19946666666666665, "grad_norm": 0.032365407794713974, "kl": 0.112884521484375, "learning_rate": 3.611111111111111e-07, "loss": 0.0046, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.031761303544044495, "mask/share_reasoning": 0.8223909139633179, "mask/share_step_conf": 0.13412901759147644, "num_tokens": 44348030.0, "reward": 0.6665481925010681, "reward_std": 0.2620869576931, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.704309344291687, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.33191192150115967, "step": 187 }, { "adv/mean_abs_final_conf": 0.642798125743866, "adv/mean_abs_reasoning": 0.5524102449417114, "adv/mean_abs_step_conf": 0.6499268412590027, "adv/ratio_final_to_reasoning": 1.1636245555360618, "adv/ratio_step_to_reasoning": 1.1765293044620866, "adv/std_final_conf": 0.8698856830596924, "adv/std_reasoning": 0.809790849685669, "adv/std_step_conf": 0.8761231899261475, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7898845447389137, "calib/avg_num_step_conf": 6.8984375, "calib/ece": 0.20147410358565732, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5657370517928287, "calib/gap": 0.461060089215429, "calib/mean_conf": 0.6721513944223106, "calib/mu_c": 0.8613513513513513, "calib/mu_w": 0.4002912621359223, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14199203187250992, "calib/std_conf": 0.42551322771472916, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42782907880133186, "calib/step_q_c_n": 901.0, "calib/step_q_gap": 0.08889266261636075, "calib/step_q_w": 0.3389364161849711, "calib/step_q_w_n": 865.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 559.65234375, "completions/mean_terminated_length": 559.65234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.20053333333333334, "grad_norm": 0.029699405655264854, "kl": 0.1065673828125, "learning_rate": 3.3333333333333335e-07, "loss": 0.0839, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031817786395549774, "mask/share_reasoning": 0.8325599431991577, "mask/share_step_conf": 0.13562226295471191, "num_tokens": 44595373.0, "reward": 0.7225303649902344, "reward_std": 0.2346128225326538, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7777925729751587, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.3555494546890259, "step": 188 }, { "adv/mean_abs_final_conf": 0.6673593521118164, "adv/mean_abs_reasoning": 0.4817875027656555, "adv/mean_abs_step_conf": 0.5264722108840942, "adv/ratio_final_to_reasoning": 1.3851736466406939, "adv/ratio_step_to_reasoning": 1.0927477526127813, "adv/std_final_conf": 0.8439661264419556, "adv/std_reasoning": 0.7393646240234375, "adv/std_step_conf": 0.7765308618545532, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7326504481434059, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.2553337301587302, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4444444444444444, "calib/gap": 0.3544060307298335, "calib/mean_conf": 0.5690313492063491, "calib/mu_c": 0.7237323943661972, "calib/mu_w": 0.3693263636363637, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13043650793650796, "calib/std_conf": 0.4417622195035688, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3589188514357054, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.041913891181333374, "calib/step_q_w": 0.317004960254372, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 481.99609375, "completions/mean_terminated_length": 481.99609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2016, "grad_norm": 0.021912207826972008, "kl": 0.12353515625, "learning_rate": 3.055555555555556e-07, "loss": -0.0996, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036062054336071014, "mask/share_reasoning": 0.8390330076217651, "mask/share_step_conf": 0.12490491569042206, "num_tokens": 44826532.0, "reward": 0.6127378344535828, "reward_std": 0.21219699084758759, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7217361927032471, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.19592693448066711, "step": 189 }, { "adv/mean_abs_final_conf": 0.6705529689788818, "adv/mean_abs_reasoning": 0.507400631904602, "adv/mean_abs_step_conf": 0.6579225063323975, "adv/ratio_final_to_reasoning": 1.3215453959169576, "adv/ratio_step_to_reasoning": 1.2966529108621518, "adv/std_final_conf": 0.8716249465942383, "adv/std_reasoning": 0.7753453254699707, "adv/std_step_conf": 0.8601924180984497, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8089280831282473, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.17622950819672134, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5286885245901639, "calib/gap": 0.5131063713426305, "calib/mean_conf": 0.6168032786885246, "calib/mu_c": 0.8397101449275362, "calib/mu_w": 0.3266037735849057, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11372950819672134, "calib/std_conf": 0.44692165006936685, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3571503487777866, "calib/step_q_c_n": 958.0, "calib/step_q_gap": 0.014972733206010502, "calib/step_q_w": 0.3421776155717761, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 566.40234375, "completions/mean_terminated_length": 573.1185913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 0.20266666666666666, "grad_norm": 0.04045505449175835, "kl": 0.1104736328125, "learning_rate": 2.7777777777777776e-07, "loss": -0.0769, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030456863343715668, "mask/share_reasoning": 0.8291221261024475, "mask/share_step_conf": 0.12870226800441742, "num_tokens": 45077139.0, "reward": 0.6780650019645691, "reward_std": 0.2419024556875229, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7663859128952026, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.29052531719207764, "step": 190 }, { "adv/mean_abs_final_conf": 0.7003530263900757, "adv/mean_abs_reasoning": 0.5041691660881042, "adv/mean_abs_step_conf": 0.5743429064750671, "adv/ratio_final_to_reasoning": 1.3891230830797932, "adv/ratio_step_to_reasoning": 1.139186894215383, "adv/std_final_conf": 0.8788830637931824, "adv/std_reasoning": 0.7393769025802612, "adv/std_step_conf": 0.8278172612190247, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6870734644720996, "calib/avg_num_step_conf": 7.0, "calib/ece": 0.33562753036437243, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5587044534412956, "calib/gap": 0.30695169276060497, "calib/mean_conf": 0.6642105263157895, "calib/mu_c": 0.839433962264151, "calib/mu_w": 0.532482269503546, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2853441295546559, "calib/std_conf": 0.42752569850846167, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42113530326594095, "calib/step_q_c_n": 643.0, "calib/step_q_gap": 0.11373678281337352, "calib/step_q_w": 0.30739852045256744, "calib/step_q_w_n": 1149.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 498.953125, "completions/mean_terminated_length": 504.8695983886719, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.20373333333333332, "grad_norm": 0.03345432132482529, "kl": 0.1206207275390625, "learning_rate": 2.5000000000000004e-07, "loss": -0.0279, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0342835932970047, "mask/share_reasoning": 0.8086893558502197, "mask/share_step_conf": 0.14530828595161438, "num_tokens": 45309039.0, "reward": 0.5651311278343201, "reward_std": 0.25212472677230835, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.6400132775306702, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.2152489423751831, "step": 191 }, { "adv/mean_abs_final_conf": 0.6397108435630798, "adv/mean_abs_reasoning": 0.4990270733833313, "adv/mean_abs_step_conf": 0.6980594396591187, "adv/ratio_final_to_reasoning": 1.281916107729252, "adv/ratio_step_to_reasoning": 1.3988408182473482, "adv/std_final_conf": 0.8631059527397156, "adv/std_reasoning": 0.757645308971405, "adv/std_step_conf": 0.8915175795555115, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7889655172413794, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.17080000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.4942528735632184, "calib/mean_conf": 0.612, "calib/mu_c": 0.8195862068965517, "calib/mu_w": 0.3253333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10140000000000003, "calib/std_conf": 0.4414757071459312, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.40529562647754136, "calib/step_q_c_n": 846.0, "calib/step_q_gap": 0.07728944409732497, "calib/step_q_w": 0.3280061823802164, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 527.40234375, "completions/mean_terminated_length": 529.4706420898438, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.2048, "grad_norm": 0.03467508777976036, "kl": 0.112152099609375, "learning_rate": 2.2222222222222224e-07, "loss": 0.0511, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03621545433998108, "mask/share_reasoning": 0.830856204032898, "mask/share_step_conf": 0.12902209162712097, "num_tokens": 45549030.0, "reward": 0.6882347464561462, "reward_std": 0.2760653495788574, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7746894359588623, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.29396748542785645, "step": 192 }, { "adv/mean_abs_final_conf": 0.7410750389099121, "adv/mean_abs_reasoning": 0.6519419550895691, "adv/mean_abs_step_conf": 0.635619044303894, "adv/ratio_final_to_reasoning": 1.1367193553421442, "adv/ratio_step_to_reasoning": 0.9749626317830206, "adv/std_final_conf": 0.9068481922149658, "adv/std_reasoning": 0.8589432835578918, "adv/std_step_conf": 0.8604073524475098, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7534294871794871, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.2312924, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.476, "calib/gap": 0.34936076923076925, "calib/mean_conf": 0.6116676, "calib/mu_c": 0.7793607692307692, "calib/mu_w": 0.43, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16147999999999996, "calib/std_conf": 0.4216746252150347, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3779138627187079, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.06123690798619763, "calib/step_q_w": 0.3166769547325103, "calib/step_q_w_n": 972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 529.48828125, "completions/mean_terminated_length": 533.657470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.20586666666666667, "grad_norm": 0.029216060414910316, "kl": 0.10797119140625, "learning_rate": 1.9444444444444447e-07, "loss": 0.0049, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03256699815392494, "mask/share_reasoning": 0.8358913064002991, "mask/share_step_conf": 0.12372923642396927, "num_tokens": 45790291.0, "reward": 0.6511830687522888, "reward_std": 0.2848384976387024, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7212778329849243, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2842133045196533, "step": 193 }, { "adv/mean_abs_final_conf": 0.6042866706848145, "adv/mean_abs_reasoning": 0.5227999687194824, "adv/mean_abs_step_conf": 0.6315984725952148, "adv/ratio_final_to_reasoning": 1.155865927392691, "adv/ratio_step_to_reasoning": 1.2081073266745166, "adv/std_final_conf": 0.8429195284843445, "adv/std_reasoning": 0.7754399180412292, "adv/std_step_conf": 0.8442825675010681, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8061941251596424, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.199203187250996, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5577689243027888, "calib/gap": 0.5051411238825032, "calib/mean_conf": 0.6288446215139443, "calib/mu_c": 0.8622962962962962, "calib/mu_w": 0.3571551724137931, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1450996015936255, "calib/std_conf": 0.4480420660931294, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4009491725768321, "calib/step_q_c_n": 846.0, "calib/step_q_gap": 0.05477458018552772, "calib/step_q_w": 0.3461745923913044, "calib/step_q_w_n": 736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 463.6953125, "completions/mean_terminated_length": 467.3464660644531, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.20693333333333333, "grad_norm": 0.02445780299603939, "kl": 0.11724853515625, "learning_rate": 1.6666666666666668e-07, "loss": -0.0587, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036237966269254684, "mask/share_reasoning": 0.8234450817108154, "mask/share_step_conf": 0.13250447809696198, "num_tokens": 46014941.0, "reward": 0.6347556710243225, "reward_std": 0.23542295396327972, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7780351638793945, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.18991366028785706, "step": 194 }, { "adv/mean_abs_final_conf": 0.6759068965911865, "adv/mean_abs_reasoning": 0.4473283290863037, "adv/mean_abs_step_conf": 0.6124850511550903, "adv/ratio_final_to_reasoning": 1.5109861205789692, "adv/ratio_step_to_reasoning": 1.36920693667251, "adv/std_final_conf": 0.8715158104896545, "adv/std_reasoning": 0.7206158638000488, "adv/std_step_conf": 0.8441517949104309, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7845496122489559, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.20072580645161292, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4879032258064516, "calib/gap": 0.47744813415523285, "calib/mean_conf": 0.5855645161290324, "calib/mu_c": 0.7915602836879432, "calib/mu_w": 0.3141121495327103, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1088709677419355, "calib/std_conf": 0.45161681305911217, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4234461009174312, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.09585865098418683, "calib/step_q_w": 0.32758744993324435, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 516.05859375, "completions/mean_terminated_length": 520.1220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 52.0, "epoch": 0.208, "grad_norm": 0.02788712829351425, "kl": 0.11358642578125, "learning_rate": 1.3888888888888888e-07, "loss": 0.0062, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034232839941978455, "mask/share_reasoning": 0.8219935894012451, "mask/share_step_conf": 0.13596110045909882, "num_tokens": 46253036.0, "reward": 0.6541599035263062, "reward_std": 0.2255578637123108, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7601671814918518, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.24424636363983154, "step": 195 }, { "adv/mean_abs_final_conf": 0.6024269461631775, "adv/mean_abs_reasoning": 0.48404091596603394, "adv/mean_abs_step_conf": 0.5885334014892578, "adv/ratio_final_to_reasoning": 1.2445785599774606, "adv/ratio_step_to_reasoning": 1.215875315653184, "adv/std_final_conf": 0.8332061171531677, "adv/std_reasoning": 0.7574361562728882, "adv/std_step_conf": 0.8277016282081604, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7445652173913042, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.24145669291338578, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6299212598425197, "calib/gap": 0.37064342828585706, "calib/mean_conf": 0.728700787401575, "calib/mu_c": 0.8979710144927535, "calib/mu_w": 0.5273275862068965, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21342519685039366, "calib/std_conf": 0.39535626941172886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4287546239210851, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.04077075295334315, "calib/step_q_w": 0.38798387096774195, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 414.421875, "completions/mean_terminated_length": 416.0470886230469, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.20906666666666668, "grad_norm": 0.02588522434234619, "kl": 0.13848876953125, "learning_rate": 1.1111111111111112e-07, "loss": 0.0036, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038649022579193115, "mask/share_reasoning": 0.8127016425132751, "mask/share_step_conf": 0.14474307000637054, "num_tokens": 46461672.0, "reward": 0.6552997827529907, "reward_std": 0.23187103867530823, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7393081784248352, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.26426011323928833, "step": 196 }, { "adv/mean_abs_final_conf": 0.6808929443359375, "adv/mean_abs_reasoning": 0.5449410080909729, "adv/mean_abs_step_conf": 0.6848366856575012, "adv/ratio_final_to_reasoning": 1.2494800982609637, "adv/ratio_step_to_reasoning": 1.256717104217589, "adv/std_final_conf": 0.8606191873550415, "adv/std_reasoning": 0.7929382920265198, "adv/std_step_conf": 0.891404926776886, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8103787673092588, "calib/avg_num_step_conf": 6.796875, "calib/ece": 0.202880658436214, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5473251028806584, "calib/gap": 0.44982079826228627, "calib/mean_conf": 0.6471604938271606, "calib/mu_c": 0.8618897637795276, "calib/mu_w": 0.41206896551724137, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16370370370370374, "calib/std_conf": 0.4252887701641552, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3990933080808081, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.06386018571793889, "calib/step_q_w": 0.33523312236286923, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 510.359375, "completions/mean_terminated_length": 520.5259399414062, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.21013333333333334, "grad_norm": 0.04117835313081741, "kl": 0.119720458984375, "learning_rate": 8.333333333333334e-08, "loss": -0.0563, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03287298232316971, "mask/share_reasoning": 0.8133430480957031, "mask/share_step_conf": 0.13425268232822418, "num_tokens": 46697380.0, "reward": 0.6645197868347168, "reward_std": 0.25330042839050293, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7390469312667847, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.30093011260032654, "step": 197 }, { "adv/mean_abs_final_conf": 0.5922660231590271, "adv/mean_abs_reasoning": 0.4888690710067749, "adv/mean_abs_step_conf": 0.6817275285720825, "adv/ratio_final_to_reasoning": 1.2115023393468458, "adv/ratio_step_to_reasoning": 1.3944992003035408, "adv/std_final_conf": 0.8186289072036743, "adv/std_reasoning": 0.7394116520881653, "adv/std_step_conf": 0.8603328466415405, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8124012118018968, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.20079999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.536, "calib/gap": 0.4844639093782929, "calib/mean_conf": 0.6236, "calib/mu_c": 0.8251369863013699, "calib/mu_w": 0.34067307692307697, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12020000000000002, "calib/std_conf": 0.4336872605922383, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.427626975763962, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.0563028650920252, "calib/step_q_w": 0.3713241106719368, "calib/step_q_w_n": 759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 444.66015625, "completions/mean_terminated_length": 449.9328308105469, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.2112, "grad_norm": 0.02617476135492325, "kl": 0.130615234375, "learning_rate": 5.555555555555556e-08, "loss": -0.0171, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03872863948345184, "mask/share_reasoning": 0.8002395629882812, "mask/share_step_conf": 0.1493130624294281, "num_tokens": 46916597.0, "reward": 0.6909536123275757, "reward_std": 0.21689258515834808, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7834550738334656, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2898584008216858, "step": 198 }, { "adv/mean_abs_final_conf": 0.658452033996582, "adv/mean_abs_reasoning": 0.5888506174087524, "adv/mean_abs_step_conf": 0.6658538579940796, "adv/ratio_final_to_reasoning": 1.1181987664275734, "adv/ratio_step_to_reasoning": 1.1307687184301194, "adv/std_final_conf": 0.844952404499054, "adv/std_reasoning": 0.8099378943443298, "adv/std_step_conf": 0.8760635852813721, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7493838148706011, "calib/avg_num_step_conf": 6.28125, "calib/ece": 0.2581069958847737, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5637860082304527, "calib/gap": 0.3322429138710119, "calib/mean_conf": 0.6967901234567903, "calib/mu_c": 0.845820895522388, "calib/mu_w": 0.5135779816513761, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2017283950617284, "calib/std_conf": 0.4023168309865329, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.38891882057716437, "calib/step_q_c_n": 797.0, "calib/step_q_gap": 0.07578688469553674, "calib/step_q_w": 0.31313193588162763, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 554.94140625, "completions/mean_terminated_length": 559.31103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.21226666666666666, "grad_norm": 0.03297751024365425, "kl": 0.1107940673828125, "learning_rate": 2.777777777777778e-08, "loss": 0.0314, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03619847819209099, "mask/share_reasoning": 0.8255935907363892, "mask/share_step_conf": 0.13039544224739075, "num_tokens": 47162862.0, "reward": 0.636322021484375, "reward_std": 0.2722158432006836, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.696749210357666, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.28136366605758667, "step": 199 }, { "adv/mean_abs_final_conf": 0.62433922290802, "adv/mean_abs_reasoning": 0.43767133355140686, "adv/mean_abs_step_conf": 0.5948544144630432, "adv/ratio_final_to_reasoning": 1.4265024346966695, "adv/ratio_step_to_reasoning": 1.3591349692386796, "adv/std_final_conf": 0.8479616641998291, "adv/std_reasoning": 0.7014275789260864, "adv/std_step_conf": 0.8276949524879456, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8478569126651627, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.16690079365079358, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5873015873015873, "calib/gap": 0.5570875281985175, "calib/mean_conf": 0.6611071428571428, "calib/mu_c": 0.8976482758620689, "calib/mu_w": 0.34056074766355143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1263055555555555, "calib/std_conf": 0.43937587457133875, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4216480503144654, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.08561692355390205, "calib/step_q_w": 0.33603112676056335, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 503.46484375, "completions/mean_terminated_length": 505.4392395019531, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.21333333333333335, "grad_norm": 0.04890147224068642, "kl": 0.11077880859375, "learning_rate": 0.0, "loss": 0.0363, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0350395031273365, "mask/share_reasoning": 0.8358122706413269, "mask/share_step_conf": 0.1252419799566269, "num_tokens": 47399797.0, "reward": 0.708634078502655, "reward_std": 0.22582747042179108, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.8145676851272583, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.29254424571990967, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.006349802596960216, "train_runtime": 14511.1788, "train_samples_per_second": 3.528, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47399797, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }