{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7493494749069214, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5704829604383013, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9337335228919983, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.042860016226768494, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.011, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8971271514892578, "reward_std": 0.1976315677165985, "rewards/accuracy_reward_step": 0.67578125, "rewards/asymmetric_l2_reward": 0.749505341053009, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.773115873336792, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5145629208746838, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9337809085845947, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04081178456544876, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0106, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.8363707661628723, "reward_std": 0.19354595243930817, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.7344152927398682, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "step": 2 }, { "adv/mean_abs_final_conf": 0.753760814666748, "adv/mean_abs_reasoning": 0.43374836444854736, "adv/mean_abs_step_conf": 0.751661479473114, "adv/ratio_final_to_reasoning": 1.737783647034735, "adv/ratio_step_to_reasoning": 1.7329436629202057, "adv/std_final_conf": 0.929868757724762, "adv/std_reasoning": 0.7013001441955566, "adv/std_step_conf": 0.9305136799812317, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5068113362541073, "calib/avg_num_step_conf": 5.0078125, "calib/ece": 0.22877952755905512, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3425196850393701, "calib/gap": 0.004660460021905566, "calib/mean_conf": 0.8794094488188977, "calib/mu_c": 0.881024096385542, "calib/mu_w": 0.8763636363636365, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22732283464566927, "calib/std_conf": 0.05409278327150863, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7904369538077403, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.023389136759923157, "calib/step_q_w": 0.7670478170478171, "calib/step_q_w_n": 481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 508.640625, "completions/mean_terminated_length": 510.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04966992139816284, "kl": 0.0011971145868301392, "learning_rate": 7.5e-07, "loss": 0.0326, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032452650368213654, "mask/share_reasoning": 0.8557740449905396, "mask/share_step_conf": 0.10786702483892441, "num_tokens": 694129.0, "reward": 0.9003467559814453, "reward_std": 0.16390666365623474, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.7596950531005859, "rewards/final_brier_reward_step": 0.7144359350204468, "rewards/format_reward_step": 0.984375, "step": 3 }, { "adv/mean_abs_final_conf": 0.7722760438919067, "adv/mean_abs_reasoning": 0.4103483259677887, "adv/mean_abs_step_conf": 0.7630362510681152, "adv/ratio_final_to_reasoning": 1.8820012048800911, "adv/ratio_step_to_reasoning": 1.859484254671997, "adv/std_final_conf": 0.9281506538391113, "adv/std_reasoning": 0.6815478205680847, "adv/std_step_conf": 0.9337376952171326, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4910992283741709, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.22568627450980389, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.22745098039215686, "calib/gap": 0.0012657371057265276, "calib/mean_conf": 0.8766666666666667, "calib/mu_c": 0.8771084337349399, "calib/mu_w": 0.8758426966292133, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22568627450980389, "calib/std_conf": 0.04235548285764873, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.798277108433735, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.006386749265493097, "calib/step_q_w": 0.7918903591682419, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 510.30078125, "completions/mean_terminated_length": 510.30078125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.004266666666666667, "grad_norm": 0.0446762815117836, "kl": 0.0002792179584503174, "learning_rate": 1.0000000000000002e-06, "loss": 0.046, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03316652029752731, "mask/share_reasoning": 0.8474521040916443, "mask/share_step_conf": 0.1193813905119896, "num_tokens": 930934.0, "reward": 0.8870111703872681, "reward_std": 0.16794714331626892, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.736449658870697, "rewards/final_brier_reward_step": 0.7102289199829102, "rewards/format_reward_step": 0.98828125, "step": 4 }, { "adv/mean_abs_final_conf": 0.7767199873924255, "adv/mean_abs_reasoning": 0.39009517431259155, "adv/mean_abs_step_conf": 0.7709915637969971, "adv/ratio_final_to_reasoning": 1.9911038088618427, "adv/ratio_step_to_reasoning": 1.9764191268338662, "adv/std_final_conf": 0.9301847219467163, "adv/std_reasoning": 0.6612535119056702, "adv/std_step_conf": 0.9337782263755798, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4286752080306432, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.33842105263157896, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.2874493927125506, "calib/gap": -0.011496499801875615, "calib/mean_conf": 0.880931174089069, "calib/mu_c": 0.8756716417910448, "calib/mu_w": 0.8871681415929205, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.33842105263157896, "calib/std_conf": 0.04485179508228513, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8001156069364163, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.007139828389703395, "calib/step_q_w": 0.7929757785467129, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 524.61328125, "completions/mean_terminated_length": 526.6705932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.005333333333333333, "grad_norm": 0.053633056581020355, "kl": 0.000286102294921875, "learning_rate": 1.25e-06, "loss": -0.0005, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03404708951711655, "mask/share_reasoning": 0.850751519203186, "mask/share_step_conf": 0.111295185983181, "num_tokens": 1171923.0, "reward": 0.7870633602142334, "reward_std": 0.16195642948150635, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6709086894989014, "rewards/final_brier_reward_step": 0.6063430309295654, "rewards/format_reward_step": 0.9609375, "step": 5 }, { "adv/mean_abs_final_conf": 0.7977774739265442, "adv/mean_abs_reasoning": 0.42455434799194336, "adv/mean_abs_step_conf": 0.7404891848564148, "adv/ratio_final_to_reasoning": 1.879093872668767, "adv/ratio_step_to_reasoning": 1.744156403906307, "adv/std_final_conf": 0.9312154054641724, "adv/std_reasoning": 0.6816370487213135, "adv/std_step_conf": 0.9339516758918762, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.433967112024666, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.3313545816733068, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.33067729083665337, "calib/gap": -0.009260662898253003, "calib/mean_conf": 0.8851394422310758, "calib/mu_c": 0.8810071942446042, "calib/mu_w": 0.8902678571428572, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3313545816733068, "calib/std_conf": 0.04471557388534855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7983999999999999, "calib/step_q_c_n": 675.0, "calib/step_q_gap": -0.011537597503900376, "calib/step_q_w": 0.8099375975039003, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 464.72265625, "completions/mean_terminated_length": 464.72265625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.0064, "grad_norm": 0.051193155348300934, "kl": 0.00040727853775024414, "learning_rate": 1.5e-06, "loss": 0.0986, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0360930897295475, "mask/share_reasoning": 0.8376978635787964, "mask/share_step_conf": 0.12620902061462402, "num_tokens": 1396844.0, "reward": 0.8131352663040161, "reward_std": 0.18004879355430603, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6967132091522217, "rewards/final_brier_reward_step": 0.6240886449813843, "rewards/format_reward_step": 0.98046875, "step": 6 }, { "adv/mean_abs_final_conf": 0.7584425210952759, "adv/mean_abs_reasoning": 0.47301214933395386, "adv/mean_abs_step_conf": 0.7564212083816528, "adv/ratio_final_to_reasoning": 1.60343137520513, "adv/ratio_step_to_reasoning": 1.5991580965663692, "adv/std_final_conf": 0.9305387735366821, "adv/std_reasoning": 0.7393888831138611, "adv/std_step_conf": 0.9338541030883789, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47629160284083, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.22399209486166, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.32806324110671936, "calib/gap": -0.002414705472775225, "calib/mean_conf": 0.8837549407114624, "calib/mu_c": 0.8829341317365269, "calib/mu_w": 0.8853488372093021, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22383399209486157, "calib/std_conf": 0.04383461580512651, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7935455635491607, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.0046248587033457245, "calib/step_q_w": 0.788920704845815, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 542.8203125, "completions/mean_terminated_length": 544.9490356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.007466666666666667, "grad_norm": 0.07613871246576309, "kl": 0.00029200315475463867, "learning_rate": 1.75e-06, "loss": 0.1122, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030497439205646515, "mask/share_reasoning": 0.8622984290122986, "mask/share_step_conf": 0.1032978817820549, "num_tokens": 1643230.0, "reward": 0.8966531753540039, "reward_std": 0.20346349477767944, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.757659375667572, "rewards/final_brier_reward_step": 0.7090843915939331, "rewards/format_reward_step": 0.98046875, "step": 7 }, { "adv/mean_abs_final_conf": 0.7677263617515564, "adv/mean_abs_reasoning": 0.44562456011772156, "adv/mean_abs_step_conf": 0.7679376602172852, "adv/ratio_final_to_reasoning": 1.722809805520469, "adv/ratio_step_to_reasoning": 1.7232839680434522, "adv/std_final_conf": 0.929553747177124, "adv/std_reasoning": 0.7014294862747192, "adv/std_step_conf": 0.9342193603515625, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5305101373446698, "calib/avg_num_step_conf": 4.59375, "calib/ece": 0.32694779116465855, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3092369477911647, "calib/gap": 0.004426422498364779, "calib/mean_conf": 0.8822891566265061, "calib/mu_c": 0.8842446043165467, "calib/mu_w": 0.8798181818181819, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3255020080321285, "calib/std_conf": 0.04544272094007264, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7964705882352942, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.03597698677277128, "calib/step_q_w": 0.7604936014625229, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 533.7578125, "completions/mean_terminated_length": 535.8510131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.008533333333333334, "grad_norm": 0.04478863254189491, "kl": 0.0004049241542816162, "learning_rate": 2.0000000000000003e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032160256057977676, "mask/share_reasoning": 0.8618191480636597, "mask/share_step_conf": 0.10211435705423355, "num_tokens": 1886384.0, "reward": 0.8338550925254822, "reward_std": 0.18050694465637207, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.7384202480316162, "rewards/final_brier_reward_step": 0.6269460916519165, "rewards/format_reward_step": 0.96875, "step": 8 }, { "adv/mean_abs_final_conf": 0.7801663875579834, "adv/mean_abs_reasoning": 0.43747538328170776, "adv/mean_abs_step_conf": 0.7640990614891052, "adv/ratio_final_to_reasoning": 1.7833377999593711, "adv/ratio_step_to_reasoning": 1.7466104166987415, "adv/std_final_conf": 0.9302932620048523, "adv/std_reasoning": 0.7014261484146118, "adv/std_step_conf": 0.9335131645202637, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.42748740982714034, "calib/avg_num_step_conf": 4.64453125, "calib/ece": 0.2562151394422311, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2908366533864542, "calib/gap": -0.010103443582414662, "calib/mean_conf": 0.880199203187251, "calib/mu_c": 0.8764556962025316, "calib/mu_w": 0.8865591397849463, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2534661354581674, "calib/std_conf": 0.044394556566673175, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7760684931506849, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.014957382039573863, "calib/step_q_w": 0.7611111111111111, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 511.79296875, "completions/mean_terminated_length": 513.800048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0096, "grad_norm": 0.042500849813222885, "kl": 0.000448763370513916, "learning_rate": 2.25e-06, "loss": 0.0164, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03407716751098633, "mask/share_reasoning": 0.8594886660575867, "mask/share_step_conf": 0.10252793878316879, "num_tokens": 2124939.0, "reward": 0.8477847576141357, "reward_std": 0.20172211527824402, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.7034733295440674, "rewards/final_brier_reward_step": 0.6749086380004883, "rewards/format_reward_step": 0.96875, "step": 9 }, { "adv/mean_abs_final_conf": 0.7686961889266968, "adv/mean_abs_reasoning": 0.46870675683021545, "adv/mean_abs_step_conf": 0.7512601017951965, "adv/ratio_final_to_reasoning": 1.6400364998474934, "adv/ratio_step_to_reasoning": 1.6028360821504721, "adv/std_final_conf": 0.9307101368904114, "adv/std_reasoning": 0.72056645154953, "adv/std_step_conf": 0.9345166087150574, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4519811320754717, "calib/avg_num_step_conf": 5.2890625, "calib/ece": 0.30464843750000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": -0.0036691823899371867, "calib/mean_conf": 0.8905859375, "calib/mu_c": 0.8890666666666666, "calib/mu_w": 0.8927358490566037, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30464843750000004, "calib/std_conf": 0.04735224178691115, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7855445544554456, "calib/step_q_c_n": 707.0, "calib/step_q_gap": -0.0018897577547554167, "calib/step_q_w": 0.787434312210201, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 516.8359375, "completions/mean_terminated_length": 518.86279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.010666666666666666, "grad_norm": 0.03771434351801872, "kl": 0.0006309151649475098, "learning_rate": 2.5e-06, "loss": 0.075, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03189965710043907, "mask/share_reasoning": 0.8523790836334229, "mask/share_step_conf": 0.11181497573852539, "num_tokens": 2364049.0, "reward": 0.8517932891845703, "reward_std": 0.19309264421463013, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7258471250534058, "rewards/final_brier_reward_step": 0.660551905632019, "rewards/format_reward_step": 1.0, "step": 10 }, { "adv/mean_abs_final_conf": 0.7453495264053345, "adv/mean_abs_reasoning": 0.3924379348754883, "adv/mean_abs_step_conf": 0.7527601718902588, "adv/ratio_final_to_reasoning": 1.8992800139003307, "adv/ratio_step_to_reasoning": 1.9181636253617853, "adv/std_final_conf": 0.9260469079017639, "adv/std_reasoning": 0.681506335735321, "adv/std_step_conf": 0.9340210556983948, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4787878787878788, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.30251968503937005, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4566929133858268, "calib/gap": -0.019564679048550082, "calib/mean_conf": 0.8861417322834646, "calib/mu_c": 0.878516129032258, "calib/mu_w": 0.898080808080808, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2892125984251968, "calib/std_conf": 0.10039329936618835, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7779198966408268, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.008329732706400583, "calib/step_q_w": 0.7695901639344263, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 531.82421875, "completions/mean_terminated_length": 531.82421875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.011733333333333333, "grad_norm": 0.038181111216545105, "kl": 0.001034379005432129, "learning_rate": 2.7500000000000004e-06, "loss": 0.103, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031716808676719666, "mask/share_reasoning": 0.8542863130569458, "mask/share_step_conf": 0.11399686336517334, "num_tokens": 2604676.0, "reward": 0.8596004247665405, "reward_std": 0.16910496354103088, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.7382351160049438, "rewards/final_brier_reward_step": 0.6614344120025635, "rewards/format_reward_step": 0.9921875, "step": 11 }, { "adv/mean_abs_final_conf": 0.7681692242622375, "adv/mean_abs_reasoning": 0.4303590655326843, "adv/mean_abs_step_conf": 0.7345424294471741, "adv/ratio_final_to_reasoning": 1.7849495590651097, "adv/ratio_step_to_reasoning": 1.7068129575427475, "adv/std_final_conf": 0.9277141094207764, "adv/std_reasoning": 0.7013878226280212, "adv/std_step_conf": 0.9326591491699219, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4637932279257816, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.2204761904761905, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": -0.002456140350877156, "calib/mean_conf": 0.8983333333333333, "calib/mu_c": 0.8975438596491229, "calib/mu_w": 0.9, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2201190476190476, "calib/std_conf": 0.05905673892705567, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7772737819025521, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.025649023682049243, "calib/step_q_w": 0.7516247582205029, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 467.9765625, "completions/mean_terminated_length": 471.6614074707031, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0128, "grad_norm": 0.06263236701488495, "kl": 0.003935456275939941, "learning_rate": 3e-06, "loss": 0.0278, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03686348348855972, "mask/share_reasoning": 0.8269122838973999, "mask/share_step_conf": 0.12841171026229858, "num_tokens": 2828654.0, "reward": 0.9123976230621338, "reward_std": 0.17215202748775482, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.7845866680145264, "rewards/final_brier_reward_step": 0.7113023400306702, "rewards/format_reward_step": 0.9765625, "step": 12 }, { "adv/mean_abs_final_conf": 0.7388582229614258, "adv/mean_abs_reasoning": 0.41692185401916504, "adv/mean_abs_step_conf": 0.7494971752166748, "adv/ratio_final_to_reasoning": 1.7721743675433765, "adv/ratio_step_to_reasoning": 1.7976922245534819, "adv/std_final_conf": 0.9258735775947571, "adv/std_reasoning": 0.7204419374465942, "adv/std_step_conf": 0.9338845610618591, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.510838445807771, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.26177865612648216, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5296442687747036, "calib/gap": 0.004040218132242646, "calib/mean_conf": 0.9060474308300396, "calib/mu_c": 0.9074846625766873, "calib/mu_w": 0.9034444444444446, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26177865612648216, "calib/std_conf": 0.04516318373782039, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.766610824742268, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.026143733447151263, "calib/step_q_w": 0.7404670912951168, "calib/step_q_w_n": 471.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 490.625, "completions/mean_terminated_length": 490.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.013866666666666666, "grad_norm": 0.04346369951963425, "kl": 0.002653837203979492, "learning_rate": 3.2500000000000002e-06, "loss": 0.0747, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0344499908387661, "mask/share_reasoning": 0.8513206243515015, "mask/share_step_conf": 0.11422930657863617, "num_tokens": 3058846.0, "reward": 0.9044943451881409, "reward_std": 0.173051655292511, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.7901186943054199, "rewards/final_brier_reward_step": 0.693869948387146, "rewards/format_reward_step": 0.98828125, "step": 13 }, { "adv/mean_abs_final_conf": 0.7576812505722046, "adv/mean_abs_reasoning": 0.5075523853302002, "adv/mean_abs_step_conf": 0.7558131217956543, "adv/ratio_final_to_reasoning": 1.4928138897018033, "adv/ratio_step_to_reasoning": 1.489133227704057, "adv/std_final_conf": 0.9261738061904907, "adv/std_reasoning": 0.7575864791870117, "adv/std_step_conf": 0.9350097179412842, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5146290491118077, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.39185483870967736, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7419354838709677, "calib/gap": 0.0006008359456635137, "calib/mean_conf": 0.9241129032258065, "calib/mu_c": 0.9243939393939393, "calib/mu_w": 0.9237931034482758, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.39185483870967736, "calib/std_conf": 0.03558298108905692, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7218681318681318, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.01843301736431502, "calib/step_q_w": 0.7034351145038168, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 563.546875, "completions/mean_terminated_length": 565.7568969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.014933333333333333, "grad_norm": 0.04784300550818443, "kl": 0.007000446319580078, "learning_rate": 3.5e-06, "loss": 0.0317, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03211609274148941, "mask/share_reasoning": 0.8483536243438721, "mask/share_step_conf": 0.11562406271696091, "num_tokens": 3308514.0, "reward": 0.8109242916107178, "reward_std": 0.2076244205236435, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.7517649531364441, "rewards/final_brier_reward_step": 0.5739898681640625, "rewards/format_reward_step": 0.96484375, "step": 14 }, { "adv/mean_abs_final_conf": 0.735247015953064, "adv/mean_abs_reasoning": 0.41596412658691406, "adv/mean_abs_step_conf": 0.7716025710105896, "adv/ratio_final_to_reasoning": 1.7675731366210423, "adv/ratio_step_to_reasoning": 1.854973834743334, "adv/std_final_conf": 0.9187384843826294, "adv/std_reasoning": 0.7012813687324524, "adv/std_step_conf": 0.9337195158004761, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49031918842014094, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.3992156862745099, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8901960784313725, "calib/gap": -0.0005752814549054852, "calib/mean_conf": 0.9364705882352942, "calib/mu_c": 0.9362043795620437, "calib/mu_w": 0.9367796610169492, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3992156862745099, "calib/std_conf": 0.030718425805495036, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6808519553072626, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.008267685644341216, "calib/step_q_w": 0.6725842696629214, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 473.62890625, "completions/mean_terminated_length": 475.4862976074219, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.016, "grad_norm": 0.03617151826620102, "kl": 0.010650634765625, "learning_rate": 3.7500000000000005e-06, "loss": -0.0205, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034178197383880615, "mask/share_reasoning": 0.8413941264152527, "mask/share_step_conf": 0.1205214262008667, "num_tokens": 3537643.0, "reward": 0.840351939201355, "reward_std": 0.1773625612258911, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.7859764099121094, "rewards/final_brier_reward_step": 0.588477373123169, "rewards/format_reward_step": 0.99609375, "step": 15 }, { "adv/mean_abs_final_conf": 0.7502319812774658, "adv/mean_abs_reasoning": 0.40567100048065186, "adv/mean_abs_step_conf": 0.7679750919342041, "adv/ratio_final_to_reasoning": 1.849360640490859, "adv/ratio_step_to_reasoning": 1.8930983260432293, "adv/std_final_conf": 0.9114437103271484, "adv/std_reasoning": 0.6816147565841675, "adv/std_step_conf": 0.9336596727371216, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5097402597402597, "calib/avg_num_step_conf": 6.546875, "calib/ece": 0.3347200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.932, "calib/gap": 0.0009997294372294796, "calib/mean_conf": 0.95072, "calib/mu_c": 0.9511038961038961, "calib/mu_w": 0.9501041666666666, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3347200000000001, "calib/std_conf": 0.027761152713819345, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6192934249263984, "calib/step_q_c_n": 1019.0, "calib/step_q_gap": 0.029019452323658768, "calib/step_q_w": 0.5902739726027396, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 663.50390625, "completions/mean_terminated_length": 668.7283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.017066666666666667, "grad_norm": 0.03823031485080719, "kl": 0.010951042175292969, "learning_rate": 4.000000000000001e-06, "loss": 0.0136, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02500973641872406, "mask/share_reasoning": 0.857978880405426, "mask/share_step_conf": 0.10919886827468872, "num_tokens": 3816348.0, "reward": 0.879867672920227, "reward_std": 0.17731714248657227, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.8082501888275146, "rewards/final_brier_reward_step": 0.6358601450920105, "rewards/format_reward_step": 0.9765625, "step": 16 }, { "adv/mean_abs_final_conf": 0.749430775642395, "adv/mean_abs_reasoning": 0.4871532618999481, "adv/mean_abs_step_conf": 0.7657231688499451, "adv/ratio_final_to_reasoning": 1.538388089036985, "adv/ratio_step_to_reasoning": 1.5718321701543072, "adv/std_final_conf": 0.9169648289680481, "adv/std_reasoning": 0.7576295733451843, "adv/std_step_conf": 0.9339720606803894, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5530997098793708, "calib/avg_num_step_conf": 5.8125, "calib/ece": 0.2446215139442231, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9243027888446215, "calib/gap": 0.0062330126736906966, "calib/mean_conf": 0.9498007968127489, "calib/mu_c": 0.951638418079096, "calib/mu_w": 0.9454054054054053, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2446215139442231, "calib/std_conf": 0.025459133712774463, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6141885325558795, "calib/step_q_c_n": 1029.0, "calib/step_q_gap": 0.04153058048616265, "calib/step_q_w": 0.5726579520697168, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 507.30078125, "completions/mean_terminated_length": 513.3162231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.018133333333333335, "grad_norm": 0.03812658414244652, "kl": 0.017798423767089844, "learning_rate": 4.25e-06, "loss": -0.0293, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032309580594301224, "mask/share_reasoning": 0.8335152864456177, "mask/share_step_conf": 0.12245635688304901, "num_tokens": 4049745.0, "reward": 0.9368460774421692, "reward_std": 0.2139730602502823, "rewards/accuracy_reward_step": 0.69140625, "rewards/asymmetric_l2_reward": 0.8321975469589233, "rewards/final_brier_reward_step": 0.7102445363998413, "rewards/format_reward_step": 0.96484375, "step": 17 }, { "adv/mean_abs_final_conf": 0.7235511541366577, "adv/mean_abs_reasoning": 0.4305734634399414, "adv/mean_abs_step_conf": 0.7692475318908691, "adv/ratio_final_to_reasoning": 1.680436012837522, "adv/ratio_step_to_reasoning": 1.7865651211878917, "adv/std_final_conf": 0.9193499088287354, "adv/std_reasoning": 0.720542848110199, "adv/std_step_conf": 0.9344537258148193, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.46025487350955774, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.4365476190476191, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": -0.00011040312914001316, "calib/mean_conf": 0.9563888888888888, "calib/mu_c": 0.9563358778625954, "calib/mu_w": 0.9564462809917355, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4365476190476191, "calib/std_conf": 0.029413168474644355, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5846951219512195, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.005368935121204288, "calib/step_q_w": 0.5793261868300152, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 508.76953125, "completions/mean_terminated_length": 512.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.0192, "grad_norm": 0.04069630801677704, "kl": 0.019733428955078125, "learning_rate": 4.5e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03226814791560173, "mask/share_reasoning": 0.8506003618240356, "mask/share_step_conf": 0.10931900143623352, "num_tokens": 4290710.0, "reward": 0.8182658553123474, "reward_std": 0.1954856812953949, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.7965109348297119, "rewards/final_brier_reward_step": 0.5423644781112671, "rewards/format_reward_step": 0.9765625, "step": 18 }, { "adv/mean_abs_final_conf": 0.7590962648391724, "adv/mean_abs_reasoning": 0.3977474570274353, "adv/mean_abs_step_conf": 0.7660222053527832, "adv/ratio_final_to_reasoning": 1.9084880404070375, "adv/ratio_step_to_reasoning": 1.925900950008954, "adv/std_final_conf": 0.9002686142921448, "adv/std_reasoning": 0.66129070520401, "adv/std_step_conf": 0.9337433576583862, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5557777777777777, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.3707843137254901, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9725490196078431, "calib/gap": 0.007238095238095044, "calib/mean_conf": 0.9590196078431373, "calib/mu_c": 0.9619999999999999, "calib/mu_w": 0.9547619047619048, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3707843137254901, "calib/std_conf": 0.027911267101634094, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5466435506241332, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.025147487632007248, "calib/step_q_w": 0.5214960629921259, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 484.01953125, "completions/mean_terminated_length": 485.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.020266666666666665, "grad_norm": 0.024780066683888435, "kl": 0.02925872802734375, "learning_rate": 4.75e-06, "loss": -0.0072, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03218923509120941, "mask/share_reasoning": 0.8538081645965576, "mask/share_step_conf": 0.11009633541107178, "num_tokens": 4519379.0, "reward": 0.8946607112884521, "reward_std": 0.16607698798179626, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8523170948028564, "rewards/final_brier_reward_step": 0.6205980777740479, "rewards/format_reward_step": 0.99609375, "step": 19 }, { "adv/mean_abs_final_conf": 0.7314550876617432, "adv/mean_abs_reasoning": 0.5331639051437378, "adv/mean_abs_step_conf": 0.7298775911331177, "adv/ratio_final_to_reasoning": 1.3719141161000898, "adv/ratio_step_to_reasoning": 1.3689553701808586, "adv/std_final_conf": 0.9152284860610962, "adv/std_reasoning": 0.7927942276000977, "adv/std_step_conf": 0.9345114827156067, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4764349489795918, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.41123015873015883, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9761904761904762, "calib/gap": -0.0030535714285715443, "calib/mean_conf": 0.964642857142857, "calib/mu_c": 0.9632857142857142, "calib/mu_w": 0.9663392857142857, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4101587301587303, "calib/std_conf": 0.02658229637795846, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5052864583333333, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.01378495683183173, "calib/step_q_w": 0.49150150150150157, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 500.95703125, "completions/mean_terminated_length": 504.9015808105469, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.021333333333333333, "grad_norm": 0.025546662509441376, "kl": 0.041919708251953125, "learning_rate": 5e-06, "loss": -0.0376, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0341811366379261, "mask/share_reasoning": 0.8281325697898865, "mask/share_step_conf": 0.12987381219863892, "num_tokens": 4752496.0, "reward": 0.8547559976577759, "reward_std": 0.21240723133087158, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.8308833837509155, "rewards/final_brier_reward_step": 0.5739409923553467, "rewards/format_reward_step": 0.9765625, "step": 20 }, { "adv/mean_abs_final_conf": 0.7029703855514526, "adv/mean_abs_reasoning": 0.4354853928089142, "adv/mean_abs_step_conf": 0.7637710571289062, "adv/ratio_final_to_reasoning": 1.6142226516881306, "adv/ratio_step_to_reasoning": 1.7538385207423937, "adv/std_final_conf": 0.8815454840660095, "adv/std_reasoning": 0.7205679416656494, "adv/std_step_conf": 0.9338224530220032, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47911710282844305, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.3549011857707511, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9802371541501976, "calib/gap": -0.002429949775310858, "calib/mean_conf": 0.9698418972332016, "calib/mu_c": 0.9689102564102563, "calib/mu_w": 0.9713402061855672, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35407114624505936, "calib/std_conf": 0.022649911297309762, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4717988826815642, "calib/step_q_c_n": 895.0, "calib/step_q_gap": 0.04224572397586318, "calib/step_q_w": 0.42955315870570104, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 512.80078125, "completions/mean_terminated_length": 514.811767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.0224, "grad_norm": 0.022558465600013733, "kl": 0.045940399169921875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0118, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0322754830121994, "mask/share_reasoning": 0.8363606333732605, "mask/share_step_conf": 0.1274576187133789, "num_tokens": 4986733.0, "reward": 0.9055733680725098, "reward_std": 0.18177592754364014, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.8619275093078613, "rewards/final_brier_reward_step": 0.6296879053115845, "rewards/format_reward_step": 0.98828125, "step": 21 }, { "adv/mean_abs_final_conf": 0.687179446220398, "adv/mean_abs_reasoning": 0.3393262028694153, "adv/mean_abs_step_conf": 0.7603123188018799, "adv/ratio_final_to_reasoning": 2.0251293310373937, "adv/ratio_step_to_reasoning": 2.2406531307412028, "adv/std_final_conf": 0.8886227607727051, "adv/std_reasoning": 0.6402140855789185, "adv/std_step_conf": 0.934212863445282, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.497718760640109, "calib/avg_num_step_conf": 6.17578125, "calib/ece": 0.3266929133858268, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": -0.0016397684712292637, "calib/mean_conf": 0.9719685039370078, "calib/mu_c": 0.9713939393939393, "calib/mu_w": 0.9730337078651685, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32452755905511815, "calib/std_conf": 0.024172587131458173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4599161616161616, "calib/step_q_c_n": 990.0, "calib/step_q_gap": -0.014872332461672633, "calib/step_q_w": 0.47478849407783424, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 505.25, "completions/mean_terminated_length": 507.2314147949219, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.023466666666666667, "grad_norm": 0.024227218702435493, "kl": 0.056705474853515625, "learning_rate": 4.944444444444445e-06, "loss": 0.0027, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031584907323122025, "mask/share_reasoning": 0.8346095681190491, "mask/share_step_conf": 0.1298992782831192, "num_tokens": 5217893.0, "reward": 0.9237887859344482, "reward_std": 0.15195703506469727, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8583118915557861, "rewards/final_brier_reward_step": 0.6619218587875366, "rewards/format_reward_step": 0.9921875, "step": 22 }, { "adv/mean_abs_final_conf": 0.7530167102813721, "adv/mean_abs_reasoning": 0.4509432315826416, "adv/mean_abs_step_conf": 0.74357008934021, "adv/ratio_final_to_reasoning": 1.6698703019414791, "adv/ratio_step_to_reasoning": 1.6489217206577373, "adv/std_final_conf": 0.897098183631897, "adv/std_reasoning": 0.7014564871788025, "adv/std_step_conf": 0.9347206950187683, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4533584431889517, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.4400395256916997, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": -0.0006911487758944901, "calib/mean_conf": 0.9727667984189724, "calib/mu_c": 0.9724444444444442, "calib/mu_w": 0.9731355932203387, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.439604743083004, "calib/std_conf": 0.02336031913958452, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.47111850865512644, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.015625136048351806, "calib/step_q_w": 0.45549337260677464, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 516.28125, "completions/mean_terminated_length": 520.346435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.024533333333333334, "grad_norm": 0.030233683064579964, "kl": 0.048114776611328125, "learning_rate": 4.9166666666666665e-06, "loss": -0.0494, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03315791115164757, "mask/share_reasoning": 0.8343464136123657, "mask/share_step_conf": 0.1246831864118576, "num_tokens": 5453997.0, "reward": 0.8372361660003662, "reward_std": 0.20087505877017975, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.8298656940460205, "rewards/final_brier_reward_step": 0.5430440902709961, "rewards/format_reward_step": 0.98046875, "step": 23 }, { "adv/mean_abs_final_conf": 0.7376492023468018, "adv/mean_abs_reasoning": 0.5325069427490234, "adv/mean_abs_step_conf": 0.7742867469787598, "adv/ratio_final_to_reasoning": 1.3852386572440694, "adv/ratio_step_to_reasoning": 1.4540406609190255, "adv/std_final_conf": 0.9101276993751526, "adv/std_reasoning": 0.7754067778587341, "adv/std_step_conf": 0.9344092011451721, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6002886002886003, "calib/avg_num_step_conf": 6.46484375, "calib/ece": 0.46437246963562745, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.006217368490095798, "calib/mean_conf": 0.974493927125506, "calib/mu_c": 0.9775396825396826, "calib/mu_w": 0.9713223140495868, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.46437246963562745, "calib/std_conf": 0.018606487581600245, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43772254335260113, "calib/step_q_c_n": 865.0, "calib/step_q_gap": -0.00044201360942414114, "calib/step_q_w": 0.4381645569620253, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 605.53125, "completions/mean_terminated_length": 605.53125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0256, "grad_norm": 0.7543313503265381, "kl": 1.2979049682617188, "learning_rate": 4.888888888888889e-06, "loss": 0.1109, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0299096517264843, "mask/share_reasoning": 0.8437927961349487, "mask/share_step_conf": 0.12629754841327667, "num_tokens": 5713525.0, "reward": 0.8208262324333191, "reward_std": 0.2088262289762497, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.831911027431488, "rewards/final_brier_reward_step": 0.5183351635932922, "rewards/format_reward_step": 0.96484375, "step": 24 }, { "adv/mean_abs_final_conf": 0.7134343385696411, "adv/mean_abs_reasoning": 0.4166892170906067, "adv/mean_abs_step_conf": 0.7519113421440125, "adv/ratio_final_to_reasoning": 1.7121497492806705, "adv/ratio_step_to_reasoning": 1.8044895603346358, "adv/std_final_conf": 0.8936386108398438, "adv/std_reasoning": 0.7013903260231018, "adv/std_step_conf": 0.934689462184906, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5044484580908423, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.36948000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": 0.0007445314067830999, "calib/mean_conf": 0.97348, "calib/mu_c": 0.973774834437086, "calib/mu_w": 0.9730303030303029, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36948000000000003, "calib/std_conf": 0.022169564722835676, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4669714285714286, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.04030949675324674, "calib/step_q_w": 0.42666193181818185, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 517.859375, "completions/mean_terminated_length": 517.859375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.02666666666666667, "grad_norm": 0.02401627041399479, "kl": 0.04943084716796875, "learning_rate": 4.861111111111111e-06, "loss": 0.0918, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03197113424539566, "mask/share_reasoning": 0.8365023136138916, "mask/share_step_conf": 0.13152649998664856, "num_tokens": 5949321.0, "reward": 0.8754016160964966, "reward_std": 0.18805429339408875, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.8299168944358826, "rewards/final_brier_reward_step": 0.6083863377571106, "rewards/format_reward_step": 0.97265625, "step": 25 }, { "adv/mean_abs_final_conf": 0.7160738706588745, "adv/mean_abs_reasoning": 0.406027227640152, "adv/mean_abs_step_conf": 0.7333802580833435, "adv/ratio_final_to_reasoning": 1.7636104721861319, "adv/ratio_step_to_reasoning": 1.806234183716648, "adv/std_final_conf": 0.8886821269989014, "adv/std_reasoning": 0.7012869715690613, "adv/std_step_conf": 0.9339894652366638, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.487475257661593, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.33404761904761915, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0007357859531772482, "calib/mean_conf": 0.972936507936508, "calib/mu_c": 0.9726708074534163, "calib/mu_w": 0.9734065934065935, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33404761904761915, "calib/std_conf": 0.01628454620025839, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47772833723653396, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.0363432021013988, "calib/step_q_w": 0.44138513513513516, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 528.06640625, "completions/mean_terminated_length": 528.06640625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.027733333333333332, "grad_norm": 0.02128530666232109, "kl": 0.05477142333984375, "learning_rate": 4.833333333333333e-06, "loss": 0.0559, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029835352674126625, "mask/share_reasoning": 0.8532722592353821, "mask/share_step_conf": 0.11689238250255585, "num_tokens": 6189746.0, "reward": 0.9097167253494263, "reward_std": 0.16467860341072083, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.8499466180801392, "rewards/final_brier_reward_step": 0.6468304395675659, "rewards/format_reward_step": 0.984375, "step": 26 }, { "adv/mean_abs_final_conf": 0.7680479288101196, "adv/mean_abs_reasoning": 0.5606542825698853, "adv/mean_abs_step_conf": 0.7543854713439941, "adv/ratio_final_to_reasoning": 1.3699136039585729, "adv/ratio_step_to_reasoning": 1.3455448300262656, "adv/std_final_conf": 0.9101540446281433, "adv/std_reasoning": 0.7928864359855652, "adv/std_step_conf": 0.9347414374351501, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4851177112066201, "calib/avg_num_step_conf": 6.8125, "calib/ece": 0.44495999999999986, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": 0.003723138110205859, "calib/mean_conf": 0.96464, "calib/mu_c": 0.9664122137404579, "calib/mu_w": 0.9626890756302521, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44279999999999986, "calib/std_conf": 0.06608532666182411, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49636254501800725, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.0435634231738799, "calib/step_q_w": 0.45279912184412735, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 556.5, "completions/mean_terminated_length": 558.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.0288, "grad_norm": 0.023293569684028625, "kl": 0.046802520751953125, "learning_rate": 4.805555555555556e-06, "loss": 0.0577, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030663229525089264, "mask/share_reasoning": 0.8329758644104004, "mask/share_step_conf": 0.13245464861392975, "num_tokens": 6437426.0, "reward": 0.8341401219367981, "reward_std": 0.22353267669677734, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.829703688621521, "rewards/final_brier_reward_step": 0.5409202575683594, "rewards/format_reward_step": 0.9765625, "step": 27 }, { "adv/mean_abs_final_conf": 0.7091407775878906, "adv/mean_abs_reasoning": 0.362392783164978, "adv/mean_abs_step_conf": 0.7690234184265137, "adv/ratio_final_to_reasoning": 1.9568291934363848, "adv/ratio_step_to_reasoning": 2.122071559235269, "adv/std_final_conf": 0.8629666566848755, "adv/std_reasoning": 0.6403860449790955, "adv/std_step_conf": 0.9344640970230103, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5548206937095825, "calib/avg_num_step_conf": 5.734375, "calib/ece": 0.29453815261044186, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.010019841269841345, "calib/mean_conf": 0.9612048192771084, "calib/mu_c": 0.9644642857142858, "calib/mu_w": 0.9544444444444444, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.29052208835341375, "calib/std_conf": 0.08898893379640163, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4770860215053763, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.05139828916336886, "calib/step_q_w": 0.42568773234200746, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 558.4375, "completions/mean_terminated_length": 558.4375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.029866666666666666, "grad_norm": 0.02454567328095436, "kl": 0.04253387451171875, "learning_rate": 4.777777777777778e-06, "loss": 0.0252, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.029856320470571518, "mask/share_reasoning": 0.8533777594566345, "mask/share_step_conf": 0.11676593124866486, "num_tokens": 6687330.0, "reward": 0.9220882058143616, "reward_std": 0.1710459291934967, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.842483401298523, "rewards/final_brier_reward_step": 0.6759117245674133, "rewards/format_reward_step": 0.97265625, "step": 28 }, { "adv/mean_abs_final_conf": 0.751983642578125, "adv/mean_abs_reasoning": 0.4891800880432129, "adv/mean_abs_step_conf": 0.755209743976593, "adv/ratio_final_to_reasoning": 1.5372327307641696, "adv/ratio_step_to_reasoning": 1.543827646373619, "adv/std_final_conf": 0.9133455753326416, "adv/std_reasoning": 0.7393599152565002, "adv/std_step_conf": 0.9341092705726624, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4830664633371458, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.48569721115537856, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": -0.007523827678230899, "calib/mean_conf": 0.9637848605577689, "calib/mu_c": 0.9599180327868853, "calib/mu_w": 0.9674418604651162, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.48171314741035864, "calib/std_conf": 0.06420535972185211, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45259562841530054, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.031010394973389588, "calib/step_q_w": 0.42158523344191096, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 585.25, "completions/mean_terminated_length": 589.8582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.030933333333333334, "grad_norm": 0.029458940029144287, "kl": 0.04529571533203125, "learning_rate": 4.75e-06, "loss": -0.0735, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027916226536035538, "mask/share_reasoning": 0.8425346612930298, "mask/share_step_conf": 0.12173663079738617, "num_tokens": 6944282.0, "reward": 0.8238710165023804, "reward_std": 0.20323438942432404, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.8522884845733643, "rewards/final_brier_reward_step": 0.5040472745895386, "rewards/format_reward_step": 0.98046875, "step": 29 }, { "adv/mean_abs_final_conf": 0.7376535534858704, "adv/mean_abs_reasoning": 0.4982551336288452, "adv/mean_abs_step_conf": 0.7502148747444153, "adv/ratio_final_to_reasoning": 1.4804735640423028, "adv/ratio_step_to_reasoning": 1.5056841848884133, "adv/std_final_conf": 0.9088844656944275, "adv/std_reasoning": 0.7576410174369812, "adv/std_step_conf": 0.9344900250434875, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.47781114447781114, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.40760162601626015, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9715447154471545, "calib/gap": 0.012942942942942848, "calib/mean_conf": 0.9563821138211382, "calib/mu_c": 0.9622222222222221, "calib/mu_w": 0.9492792792792792, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40760162601626015, "calib/std_conf": 0.0807153001610265, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42158899188876015, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.014454699322812903, "calib/step_q_w": 0.40713429256594724, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 626.84765625, "completions/mean_terminated_length": 631.783447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.032, "grad_norm": 0.02535308338701725, "kl": 0.0458221435546875, "learning_rate": 4.722222222222222e-06, "loss": 0.0568, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.026801906526088715, "mask/share_reasoning": 0.8470184803009033, "mask/share_step_conf": 0.11836712062358856, "num_tokens": 7211739.0, "reward": 0.844096839427948, "reward_std": 0.2074047029018402, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.8272979259490967, "rewards/final_brier_reward_step": 0.5632394552230835, "rewards/format_reward_step": 0.9609375, "step": 30 }, { "adv/mean_abs_final_conf": 0.7183291912078857, "adv/mean_abs_reasoning": 0.42496633529663086, "adv/mean_abs_step_conf": 0.7648087739944458, "adv/ratio_final_to_reasoning": 1.690320224321968, "adv/ratio_step_to_reasoning": 1.799692612029142, "adv/std_final_conf": 0.9244080185890198, "adv/std_reasoning": 0.7204815149307251, "adv/std_step_conf": 0.9345629215240479, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4812748493010132, "calib/avg_num_step_conf": 6.91015625, "calib/ece": 0.5126693227091633, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.952191235059761, "calib/gap": -0.007899833269205736, "calib/mean_conf": 0.9559362549800797, "calib/mu_c": 0.9515929203539825, "calib/mu_w": 0.9594927536231882, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.509203187250996, "calib/std_conf": 0.06133491690864156, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4562691131498471, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.06567853019020581, "calib/step_q_w": 0.3905905829596413, "calib/step_q_w_n": 1115.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 613.2578125, "completions/mean_terminated_length": 618.0866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.03306666666666667, "grad_norm": 0.030099138617515564, "kl": 0.04169464111328125, "learning_rate": 4.694444444444445e-06, "loss": 0.0082, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028120990842580795, "mask/share_reasoning": 0.8433920741081238, "mask/share_step_conf": 0.12067442387342453, "num_tokens": 7474645.0, "reward": 0.8082037568092346, "reward_std": 0.17442850768566132, "rewards/accuracy_reward_step": 0.44140625, "rewards/asymmetric_l2_reward": 0.8525465726852417, "rewards/final_brier_reward_step": 0.4794859290122986, "rewards/format_reward_step": 0.98046875, "step": 31 }, { "adv/mean_abs_final_conf": 0.72445148229599, "adv/mean_abs_reasoning": 0.49374479055404663, "adv/mean_abs_step_conf": 0.7507187128067017, "adv/ratio_final_to_reasoning": 1.4672589891694048, "adv/ratio_step_to_reasoning": 1.5204590046697941, "adv/std_final_conf": 0.9220708012580872, "adv/std_reasoning": 0.7753057479858398, "adv/std_step_conf": 0.9348052144050598, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6214615735016595, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.43745967741935476, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9032258064516129, "calib/gap": 0.02743476280340973, "calib/mean_conf": 0.9383467741935484, "calib/mu_c": 0.9517322834645667, "calib/mu_w": 0.924297520661157, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4318548387096774, "calib/std_conf": 0.11818899192469161, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4153371592539455, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.04276336940742842, "calib/step_q_w": 0.37257378984651707, "calib/step_q_w_n": 847.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 592.4609375, "completions/mean_terminated_length": 594.7843627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.034133333333333335, "grad_norm": 0.023633981123566628, "kl": 0.0494842529296875, "learning_rate": 4.666666666666667e-06, "loss": 0.0108, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029128411784768105, "mask/share_reasoning": 0.8510886430740356, "mask/share_step_conf": 0.1158766970038414, "num_tokens": 7733019.0, "reward": 0.8346266150474548, "reward_std": 0.2019689679145813, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.8270046710968018, "rewards/final_brier_reward_step": 0.5500609874725342, "rewards/format_reward_step": 0.96484375, "step": 32 }, { "adv/mean_abs_final_conf": 0.7848911285400391, "adv/mean_abs_reasoning": 0.4544700086116791, "adv/mean_abs_step_conf": 0.7658688426017761, "adv/ratio_final_to_reasoning": 1.7270471398932017, "adv/ratio_step_to_reasoning": 1.6851911635299375, "adv/std_final_conf": 0.9196609258651733, "adv/std_reasoning": 0.7013769149780273, "adv/std_step_conf": 0.9336271286010742, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5360434596838186, "calib/avg_num_step_conf": 6.3125, "calib/ece": 0.4496875000000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.91015625, "calib/gap": 0.008684612097906275, "calib/mean_conf": 0.94578125, "calib/mu_c": 0.9501574803149605, "calib/mu_w": 0.9414728682170542, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4496875000000001, "calib/std_conf": 0.061193971504041954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3878806333739343, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.009132834631795983, "calib/step_q_w": 0.3787477987421383, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 535.2109375, "completions/mean_terminated_length": 537.309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0352, "grad_norm": 0.021006744354963303, "kl": 0.052581787109375, "learning_rate": 4.638888888888889e-06, "loss": 0.0037, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03007342480123043, "mask/share_reasoning": 0.8435391187667847, "mask/share_step_conf": 0.12248119711875916, "num_tokens": 7976905.0, "reward": 0.8592232465744019, "reward_std": 0.16713036596775055, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.8708338141441345, "rewards/final_brier_reward_step": 0.548393726348877, "rewards/format_reward_step": 1.0, "step": 33 }, { "adv/mean_abs_final_conf": 0.7719642519950867, "adv/mean_abs_reasoning": 0.6268492937088013, "adv/mean_abs_step_conf": 0.7533676624298096, "adv/ratio_final_to_reasoning": 1.2314989579516022, "adv/ratio_step_to_reasoning": 1.2018321947408648, "adv/std_final_conf": 0.9304158091545105, "adv/std_reasoning": 0.8266779780387878, "adv/std_step_conf": 0.934239387512207, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5234611666129553, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.3810714285714285, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9087301587301587, "calib/gap": -0.0008849500483399941, "calib/mean_conf": 0.9373412698412698, "calib/mu_c": 0.9369655172413794, "calib/mu_w": 0.9378504672897194, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37150793650793645, "calib/std_conf": 0.08487429198324628, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.34746463547334067, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.025979621849362422, "calib/step_q_w": 0.32148501362397824, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 521.78515625, "completions/mean_terminated_length": 521.78515625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.03626666666666667, "grad_norm": 0.021358368918299675, "kl": 0.061359405517578125, "learning_rate": 4.611111111111112e-06, "loss": 0.0417, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03217000514268875, "mask/share_reasoning": 0.8295138478279114, "mask/share_step_conf": 0.13831612467765808, "num_tokens": 8215594.0, "reward": 0.8955637216567993, "reward_std": 0.2316317856311798, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.8735677003860474, "rewards/final_brier_reward_step": 0.6074035167694092, "rewards/format_reward_step": 0.984375, "step": 34 }, { "adv/mean_abs_final_conf": 0.7504846453666687, "adv/mean_abs_reasoning": 0.4689059257507324, "adv/mean_abs_step_conf": 0.7678923606872559, "adv/ratio_final_to_reasoning": 1.6005015167277323, "adv/ratio_step_to_reasoning": 1.6376256270548026, "adv/std_final_conf": 0.9261394143104553, "adv/std_reasoning": 0.7205601334571838, "adv/std_step_conf": 0.9343000054359436, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.593344965104686, "calib/avg_num_step_conf": 5.63671875, "calib/ece": 0.3975590551181102, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8188976377952756, "calib/gap": 0.032642073778663905, "calib/mean_conf": 0.9130708661417323, "calib/mu_c": 0.9282352941176469, "calib/mu_w": 0.895593220338983, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38759842519685034, "calib/std_conf": 0.1352309882195658, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3523180592991914, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.02979309496252952, "calib/step_q_w": 0.3225249643366619, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 582.58203125, "completions/mean_terminated_length": 582.58203125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.037333333333333336, "grad_norm": 0.022607261314988136, "kl": 0.05496978759765625, "learning_rate": 4.583333333333333e-06, "loss": -0.0247, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02826719731092453, "mask/share_reasoning": 0.8655970096588135, "mask/share_step_conf": 0.106135793030262, "num_tokens": 8473991.0, "reward": 0.8837124705314636, "reward_std": 0.18118566274642944, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8655635714530945, "rewards/final_brier_reward_step": 0.5979551076889038, "rewards/format_reward_step": 0.98828125, "step": 35 }, { "adv/mean_abs_final_conf": 0.7104417085647583, "adv/mean_abs_reasoning": 0.39619794487953186, "adv/mean_abs_step_conf": 0.7653812170028687, "adv/ratio_final_to_reasoning": 1.7931483939947632, "adv/ratio_step_to_reasoning": 1.9318152123065424, "adv/std_final_conf": 0.9201058745384216, "adv/std_reasoning": 0.7012619376182556, "adv/std_step_conf": 0.9333252906799316, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5465106897942719, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.21273809523809523, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8134920634920635, "calib/gap": 0.009519160951996675, "calib/mean_conf": 0.9189285714285714, "calib/mu_c": 0.9214594594594593, "calib/mu_w": 0.9119402985074626, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19876984126984126, "calib/std_conf": 0.1101119688844495, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3337616179001721, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.025334099472653693, "calib/step_q_w": 0.3084275184275184, "calib/step_q_w_n": 407.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 528.51171875, "completions/mean_terminated_length": 528.51171875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.0384, "grad_norm": 0.046310946345329285, "kl": 0.07181549072265625, "learning_rate": 4.555555555555556e-06, "loss": -0.0601, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03279054909944534, "mask/share_reasoning": 0.8338136672973633, "mask/share_step_conf": 0.13339582085609436, "num_tokens": 8712002.0, "reward": 0.9817196726799011, "reward_std": 0.14960414171218872, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.8716880679130554, "rewards/final_brier_reward_step": 0.7503449320793152, "rewards/format_reward_step": 0.984375, "step": 36 }, { "adv/mean_abs_final_conf": 0.7335127592086792, "adv/mean_abs_reasoning": 0.3861789107322693, "adv/mean_abs_step_conf": 0.7399336099624634, "adv/ratio_final_to_reasoning": 1.8994117462753166, "adv/ratio_step_to_reasoning": 1.9160383682252542, "adv/std_final_conf": 0.9107916951179504, "adv/std_reasoning": 0.6816076636314392, "adv/std_step_conf": 0.9339763522148132, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5764084980502892, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.47885714285714287, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8857142857142857, "calib/gap": 0.03537851284119908, "calib/mean_conf": 0.9215510204081632, "calib/mu_c": 0.9409009009009006, "calib/mu_w": 0.9055223880597015, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4736734693877551, "calib/std_conf": 0.13022759687852142, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3618576388888889, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.07817451131687242, "calib/step_q_w": 0.28368312757201647, "calib/step_q_w_n": 972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 562.7578125, "completions/mean_terminated_length": 569.4308471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.039466666666666664, "grad_norm": 0.04367915168404579, "kl": 0.06029510498046875, "learning_rate": 4.527777777777778e-06, "loss": -0.0503, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029208239167928696, "mask/share_reasoning": 0.842954158782959, "mask/share_step_conf": 0.11611886322498322, "num_tokens": 8963164.0, "reward": 0.8178337812423706, "reward_std": 0.1575174331665039, "rewards/accuracy_reward_step": 0.43359375, "rewards/asymmetric_l2_reward": 0.8471627235412598, "rewards/final_brier_reward_step": 0.5103796720504761, "rewards/format_reward_step": 0.95703125, "step": 37 }, { "adv/mean_abs_final_conf": 0.7438644766807556, "adv/mean_abs_reasoning": 0.3514899015426636, "adv/mean_abs_step_conf": 0.7278769612312317, "adv/ratio_final_to_reasoning": 2.1163182026453353, "adv/ratio_step_to_reasoning": 2.0708332103899223, "adv/std_final_conf": 0.9110886454582214, "adv/std_reasoning": 0.6403597593307495, "adv/std_step_conf": 0.9339098930358887, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6162587412587412, "calib/avg_num_step_conf": 6.0, "calib/ece": 0.4017670682730922, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8192771084337349, "calib/gap": 0.022062937062936938, "calib/mean_conf": 0.9255421686746987, "calib/mu_c": 0.9359090909090908, "calib/mu_w": 0.9138461538461539, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39859437751004, "calib/std_conf": 0.07661387585031114, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33544607190412784, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.037586199292662825, "calib/step_q_w": 0.297859872611465, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 579.6796875, "completions/mean_terminated_length": 581.9530029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.04053333333333333, "grad_norm": 0.021768247708678246, "kl": 0.060333251953125, "learning_rate": 4.5e-06, "loss": 0.0435, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03031359612941742, "mask/share_reasoning": 0.8476117849349976, "mask/share_step_conf": 0.11816837638616562, "num_tokens": 9218450.0, "reward": 0.8697977066040039, "reward_std": 0.14932399988174438, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.8586658239364624, "rewards/final_brier_reward_step": 0.5832734107971191, "rewards/format_reward_step": 0.97265625, "step": 38 }, { "adv/mean_abs_final_conf": 0.7321931719779968, "adv/mean_abs_reasoning": 0.4579048156738281, "adv/mean_abs_step_conf": 0.7451863884925842, "adv/ratio_final_to_reasoning": 1.5990073633546322, "adv/ratio_step_to_reasoning": 1.6273827288669327, "adv/std_final_conf": 0.9224193692207336, "adv/std_reasoning": 0.7205734252929688, "adv/std_step_conf": 0.9335898756980896, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6688839615668883, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.38768627450980386, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7725490196078432, "calib/gap": 0.05266075388026592, "calib/mean_conf": 0.8949019607843137, "calib/mu_c": 0.9203030303030303, "calib/mu_w": 0.8676422764227644, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3824705882352941, "calib/std_conf": 0.16491676509051476, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3508021390374331, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.03972902582988591, "calib/step_q_w": 0.3110731132075472, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 546.03125, "completions/mean_terminated_length": 546.03125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.0416, "grad_norm": 0.028995206579566002, "kl": 0.0600128173828125, "learning_rate": 4.472222222222223e-06, "loss": -0.0347, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030841421335935593, "mask/share_reasoning": 0.8439503908157349, "mask/share_step_conf": 0.12520815432071686, "num_tokens": 9464322.0, "reward": 0.8865332007408142, "reward_std": 0.1797136813402176, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.8740624785423279, "rewards/final_brier_reward_step": 0.5990039110183716, "rewards/format_reward_step": 0.984375, "step": 39 }, { "adv/mean_abs_final_conf": 0.7255151867866516, "adv/mean_abs_reasoning": 0.5067353844642639, "adv/mean_abs_step_conf": 0.7520684003829956, "adv/ratio_final_to_reasoning": 1.4317436852247616, "adv/ratio_step_to_reasoning": 1.4841442366968418, "adv/std_final_conf": 0.9328517913818359, "adv/std_reasoning": 0.7926381826400757, "adv/std_step_conf": 0.9337299466133118, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4924899446958271, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.45719367588932813, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7075098814229249, "calib/gap": -0.004004524886877858, "calib/mean_conf": 0.9013833992094862, "calib/mu_c": 0.8992307692307693, "calib/mu_w": 0.9032352941176471, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4480632411067194, "calib/std_conf": 0.12519101311172237, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35236206896551725, "calib/step_q_c_n": 580.0, "calib/step_q_gap": 0.020086619863720845, "calib/step_q_w": 0.3322754491017964, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 570.9453125, "completions/mean_terminated_length": 573.184326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.042666666666666665, "grad_norm": 0.029202446341514587, "kl": 0.06134796142578125, "learning_rate": 4.444444444444444e-06, "loss": -0.0549, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030164752155542374, "mask/share_reasoning": 0.8536300659179688, "mask/share_step_conf": 0.11229896545410156, "num_tokens": 9717244.0, "reward": 0.8443441390991211, "reward_std": 0.1785648614168167, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.8648823499679565, "rewards/final_brier_reward_step": 0.5347433686256409, "rewards/format_reward_step": 0.98828125, "step": 40 }, { "adv/mean_abs_final_conf": 0.7701693773269653, "adv/mean_abs_reasoning": 0.47483962774276733, "adv/mean_abs_step_conf": 0.7666299343109131, "adv/ratio_final_to_reasoning": 1.6219568299050762, "adv/ratio_step_to_reasoning": 1.6145028542693913, "adv/std_final_conf": 0.9186797142028809, "adv/std_reasoning": 0.7206448316574097, "adv/std_step_conf": 0.9336923360824585, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6092863894139887, "calib/avg_num_step_conf": 5.66015625, "calib/ece": 0.19758893280632403, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6719367588932806, "calib/gap": 0.041159420289854975, "calib/mean_conf": 0.8674703557312253, "calib/mu_c": 0.8786956521739129, "calib/mu_w": 0.8375362318840579, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.168893280632411, "calib/std_conf": 0.1793265952873243, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3456231599607458, "calib/step_q_c_n": 1019.0, "calib/step_q_gap": 0.016971764611908524, "calib/step_q_w": 0.32865139534883725, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 526.03515625, "completions/mean_terminated_length": 526.03515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.04373333333333333, "grad_norm": 0.03552259877324104, "kl": 0.05785369873046875, "learning_rate": 4.416666666666667e-06, "loss": 0.0089, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03249321132898331, "mask/share_reasoning": 0.8456205129623413, "mask/share_step_conf": 0.121886245906353, "num_tokens": 9959157.0, "reward": 0.9845508337020874, "reward_std": 0.17637822031974792, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.8705066442489624, "rewards/final_brier_reward_step": 0.7571886777877808, "rewards/format_reward_step": 0.98828125, "step": 41 }, { "adv/mean_abs_final_conf": 0.7168235778808594, "adv/mean_abs_reasoning": 0.3787510395050049, "adv/mean_abs_step_conf": 0.7346171736717224, "adv/ratio_final_to_reasoning": 1.8925983115919267, "adv/ratio_step_to_reasoning": 1.9395779735200305, "adv/std_final_conf": 0.9166406393051147, "adv/std_reasoning": 0.6814785599708557, "adv/std_step_conf": 0.9328687787055969, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.630614352090862, "calib/avg_num_step_conf": 6.328125, "calib/ece": 0.3178656126482214, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6758893280632411, "calib/gap": 0.05375709860609179, "calib/mean_conf": 0.8887747035573121, "calib/mu_c": 0.9108724832214763, "calib/mu_w": 0.8571153846153845, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3088537549407115, "calib/std_conf": 0.1539560041628695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3585111111111111, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.04060833333333336, "calib/step_q_w": 0.31790277777777776, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 480.5859375, "completions/mean_terminated_length": 482.4706115722656, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.0448, "grad_norm": 0.02795771323144436, "kl": 0.0621185302734375, "learning_rate": 4.388888888888889e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032921478152275085, "mask/share_reasoning": 0.8269742727279663, "mask/share_step_conf": 0.1361980140209198, "num_tokens": 10186555.0, "reward": 0.9259449243545532, "reward_std": 0.14228272438049316, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8753531575202942, "rewards/final_brier_reward_step": 0.6624742150306702, "rewards/format_reward_step": 0.98828125, "step": 42 }, { "adv/mean_abs_final_conf": 0.7828581929206848, "adv/mean_abs_reasoning": 0.5349443554878235, "adv/mean_abs_step_conf": 0.7610698938369751, "adv/ratio_final_to_reasoning": 1.4634385518598194, "adv/ratio_step_to_reasoning": 1.4227085229134617, "adv/std_final_conf": 0.930379331111908, "adv/std_reasoning": 0.7753865122795105, "adv/std_step_conf": 0.9323244690895081, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6944133383103148, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.25788235294117645, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4980392156862745, "calib/gap": 0.1379805897723031, "calib/mean_conf": 0.8031372549019609, "calib/mu_c": 0.8648226950354609, "calib/mu_w": 0.7268421052631578, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2540392156862744, "calib/std_conf": 0.22322669560713412, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3525706940874036, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.020271481488978393, "calib/step_q_w": 0.3322992125984252, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 518.61328125, "completions/mean_terminated_length": 518.61328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.04586666666666667, "grad_norm": 0.030341370031237602, "kl": 0.0573577880859375, "learning_rate": 4.361111111111112e-06, "loss": 0.0182, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03317509591579437, "mask/share_reasoning": 0.8494628667831421, "mask/share_step_conf": 0.11736202985048294, "num_tokens": 10424544.0, "reward": 0.9504603743553162, "reward_std": 0.17397907376289368, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.885722279548645, "rewards/final_brier_reward_step": 0.7058234810829163, "rewards/format_reward_step": 0.99609375, "step": 43 }, { "adv/mean_abs_final_conf": 0.77141273021698, "adv/mean_abs_reasoning": 0.5115665197372437, "adv/mean_abs_step_conf": 0.7307957410812378, "adv/ratio_final_to_reasoning": 1.5079421745840627, "adv/ratio_step_to_reasoning": 1.4285448966765788, "adv/std_final_conf": 0.9244205355644226, "adv/std_reasoning": 0.7575879096984863, "adv/std_step_conf": 0.9330092072486877, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6712398373983739, "calib/avg_num_step_conf": 6.0703125, "calib/ece": 0.33601593625498005, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5179282868525896, "calib/gap": 0.1214399136178862, "calib/mean_conf": 0.8069322709163347, "calib/mu_c": 0.8688617886178862, "calib/mu_w": 0.747421875, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32645418326693226, "calib/std_conf": 0.2188446468993844, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.37137755102040815, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.04472820037105746, "calib/step_q_w": 0.3266493506493507, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 587.4921875, "completions/mean_terminated_length": 589.796142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 250.0, "epoch": 0.046933333333333334, "grad_norm": 0.03289037570357323, "kl": 0.049488067626953125, "learning_rate": 4.333333333333334e-06, "loss": -0.061, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027674272656440735, "mask/share_reasoning": 0.8532435894012451, "mask/share_step_conf": 0.11517593264579773, "num_tokens": 10681262.0, "reward": 0.9121721982955933, "reward_std": 0.17224377393722534, "rewards/accuracy_reward_step": 0.48046875, "rewards/asymmetric_l2_reward": 0.8826147317886353, "rewards/final_brier_reward_step": 0.6495422124862671, "rewards/format_reward_step": 0.98046875, "step": 44 }, { "adv/mean_abs_final_conf": 0.726306140422821, "adv/mean_abs_reasoning": 0.43901243805885315, "adv/mean_abs_step_conf": 0.7687171101570129, "adv/ratio_final_to_reasoning": 1.6544090268473302, "adv/ratio_step_to_reasoning": 1.7510144212678553, "adv/std_final_conf": 0.9306395053863525, "adv/std_reasoning": 0.7204497456550598, "adv/std_step_conf": 0.9319562315940857, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6645899554990464, "calib/avg_num_step_conf": 6.30078125, "calib/ece": 0.2612252964426878, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5375494071146245, "calib/gap": 0.12187412587412572, "calib/mean_conf": 0.7992490118577076, "calib/mu_c": 0.8522377622377623, "calib/mu_w": 0.7303636363636365, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24762845849802378, "calib/std_conf": 0.23942253743503325, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3627835051546392, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.02529701866815276, "calib/step_q_w": 0.33748648648648644, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 542.84765625, "completions/mean_terminated_length": 544.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.048, "grad_norm": 0.031237227842211723, "kl": 0.054935455322265625, "learning_rate": 4.305555555555556e-06, "loss": -0.0223, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031349748373031616, "mask/share_reasoning": 0.8371763229370117, "mask/share_step_conf": 0.12756764888763428, "num_tokens": 10925279.0, "reward": 0.9448119401931763, "reward_std": 0.15327224135398865, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.8864164352416992, "rewards/final_brier_reward_step": 0.6938323974609375, "rewards/format_reward_step": 0.98828125, "step": 45 }, { "adv/mean_abs_final_conf": 0.7354565858840942, "adv/mean_abs_reasoning": 0.40667724609375, "adv/mean_abs_step_conf": 0.7561466693878174, "adv/ratio_final_to_reasoning": 1.8084527544837161, "adv/ratio_step_to_reasoning": 1.8593286854645055, "adv/std_final_conf": 0.919183075428009, "adv/std_reasoning": 0.6816875338554382, "adv/std_step_conf": 0.9330542087554932, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6204044117647058, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.283508064516129, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5362903225806451, "calib/gap": 0.09378676470588243, "calib/mean_conf": 0.7820564516129033, "calib/mu_c": 0.8244117647058824, "calib/mu_w": 0.730625, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2585887096774193, "calib/std_conf": 0.2628921894329086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35542750929368033, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.02335401848003199, "calib/step_q_w": 0.33207349081364834, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 589.328125, "completions/mean_terminated_length": 589.328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.04906666666666667, "grad_norm": 0.0332394540309906, "kl": 0.04784393310546875, "learning_rate": 4.277777777777778e-06, "loss": -0.0168, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032339829951524734, "mask/share_reasoning": 0.8446968197822571, "mask/share_step_conf": 0.12296333909034729, "num_tokens": 11180915.0, "reward": 0.9091041088104248, "reward_std": 0.16267403960227966, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8642227053642273, "rewards/final_brier_reward_step": 0.6539855599403381, "rewards/format_reward_step": 0.96875, "step": 46 }, { "adv/mean_abs_final_conf": 0.7560012340545654, "adv/mean_abs_reasoning": 0.48656049370765686, "adv/mean_abs_step_conf": 0.7640652656555176, "adv/ratio_final_to_reasoning": 1.553766168506065, "adv/ratio_step_to_reasoning": 1.5703397121974223, "adv/std_final_conf": 0.9190072417259216, "adv/std_reasoning": 0.7393056750297546, "adv/std_step_conf": 0.932059109210968, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7265749601275917, "calib/avg_num_step_conf": 6.23828125, "calib/ece": 0.1629083665338646, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.47808764940239046, "calib/gap": 0.20046318447634248, "calib/mean_conf": 0.7527091633466135, "calib/mu_c": 0.8317763157894738, "calib/mu_w": 0.6313131313131313, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15501992031872514, "calib/std_conf": 0.25110399877916256, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.35576086956521746, "calib/step_q_c_n": 920.0, "calib/step_q_gap": 0.020487605163444944, "calib/step_q_w": 0.3352732644017725, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 584.73828125, "completions/mean_terminated_length": 589.342529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.050133333333333335, "grad_norm": 0.05675409361720085, "kl": 0.051654815673828125, "learning_rate": 4.25e-06, "loss": -0.0528, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028933309018611908, "mask/share_reasoning": 0.8470029830932617, "mask/share_step_conf": 0.11625122278928757, "num_tokens": 11436584.0, "reward": 0.9713031053543091, "reward_std": 0.14311102032661438, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.87469881772995, "rewards/final_brier_reward_step": 0.7538449168205261, "rewards/format_reward_step": 0.9765625, "step": 47 }, { "adv/mean_abs_final_conf": 0.7458685040473938, "adv/mean_abs_reasoning": 0.45874661207199097, "adv/mean_abs_step_conf": 0.7546839118003845, "adv/ratio_final_to_reasoning": 1.625883405827409, "adv/ratio_step_to_reasoning": 1.645099695432633, "adv/std_final_conf": 0.9048066139221191, "adv/std_reasoning": 0.7014490365982056, "adv/std_step_conf": 0.9324750304222107, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6762935450819672, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.2634400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.15932248975409835, "calib/mean_conf": 0.74936, "calib/mu_c": 0.827109375, "calib/mu_w": 0.6677868852459017, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.25040000000000007, "calib/std_conf": 0.2668115259879153, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.38141732283464563, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.009236767279090063, "calib/step_q_w": 0.37218055555555557, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 515.4296875, "completions/mean_terminated_length": 517.4509887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0512, "grad_norm": 0.041724901646375656, "kl": 0.057010650634765625, "learning_rate": 4.222222222222223e-06, "loss": -0.0685, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033603981137275696, "mask/share_reasoning": 0.8387103080749512, "mask/share_step_conf": 0.12377943843603134, "num_tokens": 11672222.0, "reward": 0.919036865234375, "reward_std": 0.13520202040672302, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.8569885492324829, "rewards/final_brier_reward_step": 0.6857726573944092, "rewards/format_reward_step": 0.9765625, "step": 48 }, { "adv/mean_abs_final_conf": 0.7114934325218201, "adv/mean_abs_reasoning": 0.46754950284957886, "adv/mean_abs_step_conf": 0.7762830257415771, "adv/ratio_final_to_reasoning": 1.5217499498672837, "adv/ratio_step_to_reasoning": 1.66032264179591, "adv/std_final_conf": 0.868726372718811, "adv/std_reasoning": 0.7015178799629211, "adv/std_step_conf": 0.9322298765182495, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.682031040941932, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.28253012048192777, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.714859437751004, "calib/gap": 0.136437650521809, "calib/mean_conf": 0.8418875502008032, "calib/mu_c": 0.8972297297297297, "calib/mu_w": 0.7607920792079207, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2650200803212852, "calib/std_conf": 0.23948583521095618, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4151843043995244, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.04015183686705692, "calib/step_q_w": 0.3750324675324675, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 529.74609375, "completions/mean_terminated_length": 531.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.05226666666666667, "grad_norm": 0.04617791995406151, "kl": 0.058994293212890625, "learning_rate": 4.194444444444445e-06, "loss": -0.0631, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03090585768222809, "mask/share_reasoning": 0.8450629711151123, "mask/share_step_conf": 0.12012490630149841, "num_tokens": 11912373.0, "reward": 0.9283610582351685, "reward_std": 0.18739992380142212, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8635416626930237, "rewards/final_brier_reward_step": 0.6838054656982422, "rewards/format_reward_step": 0.96875, "step": 49 }, { "adv/mean_abs_final_conf": 0.6450693607330322, "adv/mean_abs_reasoning": 0.384204626083374, "adv/mean_abs_step_conf": 0.7576757073402405, "adv/ratio_final_to_reasoning": 1.6789734348306609, "adv/ratio_step_to_reasoning": 1.972062947456082, "adv/std_final_conf": 0.846005380153656, "adv/std_reasoning": 0.6612586975097656, "adv/std_step_conf": 0.9314751625061035, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7366114230927269, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.24157894736842106, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.680161943319838, "calib/gap": 0.14676003287220918, "calib/mean_conf": 0.8274089068825912, "calib/mu_c": 0.8856375838926173, "calib/mu_w": 0.7388775510204081, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2328744939271255, "calib/std_conf": 0.24239478038170492, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43133171912832935, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.046189314065038234, "calib/step_q_w": 0.3851424050632911, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2149.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 552.40234375, "completions/mean_terminated_length": 558.9525756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.05333333333333334, "grad_norm": 0.07475942373275757, "kl": 0.048542022705078125, "learning_rate": 4.166666666666667e-06, "loss": -0.0752, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030771994963288307, "mask/share_reasoning": 0.8386844396591187, "mask/share_step_conf": 0.118824802339077, "num_tokens": 12159148.0, "reward": 0.9325626492500305, "reward_std": 0.1499963104724884, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.863436222076416, "rewards/final_brier_reward_step": 0.693095326423645, "rewards/format_reward_step": 0.9609375, "step": 50 }, { "adv/mean_abs_final_conf": 0.6528294086456299, "adv/mean_abs_reasoning": 0.4469456076622009, "adv/mean_abs_step_conf": 0.7476691007614136, "adv/ratio_final_to_reasoning": 1.4606462116505121, "adv/ratio_step_to_reasoning": 1.672841365803281, "adv/std_final_conf": 0.8555540442466736, "adv/std_reasoning": 0.7392084002494812, "adv/std_step_conf": 0.9323121309280396, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7527228581338884, "calib/avg_num_step_conf": 5.23828125, "calib/ece": 0.18209677419354817, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6330645161290323, "calib/gap": 0.23286021505376342, "calib/mean_conf": 0.7982258064516129, "calib/mu_c": 0.8855483870967741, "calib/mu_w": 0.6526881720430107, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1776612903225804, "calib/std_conf": 0.2629716910653483, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.430625, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.032586904761904734, "calib/step_q_w": 0.39803809523809525, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 546.97265625, "completions/mean_terminated_length": 546.97265625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.0544, "grad_norm": 0.04614703357219696, "kl": 0.046630859375, "learning_rate": 4.138888888888889e-06, "loss": -0.0507, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03051183931529522, "mask/share_reasoning": 0.8613395690917969, "mask/share_step_conf": 0.10814858973026276, "num_tokens": 12408469.0, "reward": 0.9670203924179077, "reward_std": 0.17126522958278656, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.8670368194580078, "rewards/final_brier_reward_step": 0.7513788938522339, "rewards/format_reward_step": 0.96875, "step": 51 }, { "adv/mean_abs_final_conf": 0.5877702832221985, "adv/mean_abs_reasoning": 0.37581461668014526, "adv/mean_abs_step_conf": 0.7422032356262207, "adv/ratio_final_to_reasoning": 1.563989949125497, "adv/ratio_step_to_reasoning": 1.9749184908843174, "adv/std_final_conf": 0.8321949243545532, "adv/std_reasoning": 0.6612822413444519, "adv/std_step_conf": 0.9306304454803467, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7521001344086022, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.13200000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.548, "calib/gap": 0.2936441532258066, "calib/mean_conf": 0.7214400000000001, "calib/mu_c": 0.7966129032258066, "calib/mu_w": 0.50296875, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.05472000000000001, "calib/std_conf": 0.3111467923665613, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43921052631578944, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.0395864661654135, "calib/step_q_w": 0.39962406015037594, "calib/step_q_w_n": 399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 542.125, "completions/mean_terminated_length": 546.3936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.055466666666666664, "grad_norm": 9.542619705200195, "kl": 10.608467102050781, "learning_rate": 4.111111111111111e-06, "loss": 0.0357, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030428439378738403, "mask/share_reasoning": 0.8515126705169678, "mask/share_step_conf": 0.11024642735719681, "num_tokens": 12655205.0, "reward": 1.0111010074615479, "reward_std": 0.15742525458335876, "rewards/accuracy_reward_step": 0.73046875, "rewards/asymmetric_l2_reward": 0.8825316429138184, "rewards/final_brier_reward_step": 0.7998265624046326, "rewards/format_reward_step": 0.96875, "step": 52 }, { "adv/mean_abs_final_conf": 0.6738765239715576, "adv/mean_abs_reasoning": 0.5075792074203491, "adv/mean_abs_step_conf": 0.7184747457504272, "adv/ratio_final_to_reasoning": 1.3276283073067061, "adv/ratio_step_to_reasoning": 1.4154928634722936, "adv/std_final_conf": 0.8424535989761353, "adv/std_reasoning": 0.7575823068618774, "adv/std_step_conf": 0.9306389093399048, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6674550299800133, "calib/avg_num_step_conf": 5.80859375, "calib/ece": 0.20988142292490114, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.16155629580279796, "calib/mean_conf": 0.7785770750988144, "calib/mu_c": 0.8392405063291137, "calib/mu_w": 0.6776842105263158, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18197628458498022, "calib/std_conf": 0.29109910124762695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4375678496868476, "calib/step_q_c_n": 958.0, "calib/step_q_gap": 0.016811706019550843, "calib/step_q_w": 0.42075614366729674, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 545.59375, "completions/mean_terminated_length": 545.59375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.05653333333333333, "grad_norm": 0.06023424491286278, "kl": 0.09381866455078125, "learning_rate": 4.083333333333334e-06, "loss": -0.0644, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03022611513733864, "mask/share_reasoning": 0.853451132774353, "mask/share_step_conf": 0.11632277071475983, "num_tokens": 12900701.0, "reward": 0.963081955909729, "reward_std": 0.16977277398109436, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8808630704879761, "rewards/final_brier_reward_step": 0.7242070436477661, "rewards/format_reward_step": 0.98828125, "step": 53 }, { "adv/mean_abs_final_conf": 0.5761524438858032, "adv/mean_abs_reasoning": 0.3302342891693115, "adv/mean_abs_step_conf": 0.7532912492752075, "adv/ratio_final_to_reasoning": 1.744677832623278, "adv/ratio_step_to_reasoning": 2.281081262548706, "adv/std_final_conf": 0.7929055690765381, "adv/std_reasoning": 0.596068799495697, "adv/std_step_conf": 0.9317674040794373, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.8392947834288617, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.11992187500000008, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.64453125, "calib/gap": 0.34614089820793725, "calib/mean_conf": 0.8008593750000002, "calib/mu_c": 0.9049720670391062, "calib/mu_w": 0.5588311688311689, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11078125000000008, "calib/std_conf": 0.27467460107299574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46659142212189614, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.04299142212189622, "calib/step_q_w": 0.4235999999999999, "calib/step_q_w_n": 450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 469.8203125, "completions/mean_terminated_length": 471.66278076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.0576, "grad_norm": 0.14687201380729675, "kl": 0.05890655517578125, "learning_rate": 4.055555555555556e-06, "loss": -0.0315, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034429773688316345, "mask/share_reasoning": 0.8422298431396484, "mask/share_step_conf": 0.11943414062261581, "num_tokens": 13127207.0, "reward": 1.0360541343688965, "reward_std": 0.09538309276103973, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8827582597732544, "rewards/final_brier_reward_step": 0.8495062589645386, "rewards/format_reward_step": 1.0, "step": 54 }, { "adv/mean_abs_final_conf": 0.61080002784729, "adv/mean_abs_reasoning": 0.3844855725765228, "adv/mean_abs_step_conf": 0.7339929342269897, "adv/ratio_final_to_reasoning": 1.5886162483397857, "adv/ratio_step_to_reasoning": 1.9090259468212054, "adv/std_final_conf": 0.8445244431495667, "adv/std_reasoning": 0.6815156936645508, "adv/std_step_conf": 0.9316856861114502, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8308004052684904, "calib/avg_num_step_conf": 4.9296875, "calib/ece": 0.25086956521739145, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6482213438735178, "calib/gap": 0.2831933890577505, "calib/mean_conf": 0.7950592885375494, "calib/mu_c": 0.9204255319148934, "calib/mu_w": 0.6372321428571429, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24430830039525706, "calib/std_conf": 0.2872626988401877, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47453996983408747, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.07060007000103241, "calib/step_q_w": 0.40393989983305506, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 499.4375, "completions/mean_terminated_length": 501.3961181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.058666666666666666, "grad_norm": 0.06633848696947098, "kl": 0.06307220458984375, "learning_rate": 4.027777777777779e-06, "loss": -0.0112, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03279150649905205, "mask/share_reasoning": 0.8528703451156616, "mask/share_step_conf": 0.11043195426464081, "num_tokens": 13362887.0, "reward": 0.9695348739624023, "reward_std": 0.16215607523918152, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.8861154317855835, "rewards/final_brier_reward_step": 0.745141863822937, "rewards/format_reward_step": 0.98828125, "step": 55 }, { "adv/mean_abs_final_conf": 0.6004599332809448, "adv/mean_abs_reasoning": 0.48417240381240845, "adv/mean_abs_step_conf": 0.7438913583755493, "adv/ratio_final_to_reasoning": 1.2401779377611777, "adv/ratio_step_to_reasoning": 1.536418334704942, "adv/std_final_conf": 0.8093182444572449, "adv/std_reasoning": 0.7393047213554382, "adv/std_step_conf": 0.9322188496589661, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6902107758344544, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.40175999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.844, "calib/gap": 0.11437247741687495, "calib/mean_conf": 0.9097600000000001, "calib/mu_c": 0.9651162790697675, "calib/mu_w": 0.8507438016528925, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.39775999999999995, "calib/std_conf": 0.21169020383569948, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4875795297372061, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.03625405109213975, "calib/step_q_w": 0.45132547864506634, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 526.91796875, "completions/mean_terminated_length": 528.984375, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.05973333333333333, "grad_norm": 0.1122933179140091, "kl": 0.05873870849609375, "learning_rate": 4.000000000000001e-06, "loss": -0.0373, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032018400728702545, "mask/share_reasoning": 0.8491096496582031, "mask/share_step_conf": 0.11496569961309433, "num_tokens": 13604618.0, "reward": 0.8735764026641846, "reward_std": 0.1838463842868805, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.8586329221725464, "rewards/final_brier_reward_step": 0.5932074189186096, "rewards/format_reward_step": 0.97265625, "step": 56 }, { "adv/mean_abs_final_conf": 0.5015829205513, "adv/mean_abs_reasoning": 0.3861631751060486, "adv/mean_abs_step_conf": 0.7528830766677856, "adv/ratio_final_to_reasoning": 1.2988885343962555, "adv/ratio_step_to_reasoning": 1.949650109596877, "adv/std_final_conf": 0.742138147354126, "adv/std_reasoning": 0.6614132523536682, "adv/std_step_conf": 0.9319602251052856, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7295296167247386, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.27711999999999987, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.88, "calib/gap": 0.10993176538908245, "calib/mean_conf": 0.93424, "calib/mu_c": 0.970297619047619, "calib/mu_w": 0.8603658536585366, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26967999999999986, "calib/std_conf": 0.17799444485713592, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5003476245654692, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.05931551447372613, "calib/step_q_w": 0.4410321100917431, "calib/step_q_w_n": 436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 518.72265625, "completions/mean_terminated_length": 524.87353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.0608, "grad_norm": 0.05452043563127518, "kl": 0.05272674560546875, "learning_rate": 3.972222222222223e-06, "loss": -0.0571, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03176348656415939, "mask/share_reasoning": 0.8486981987953186, "mask/share_step_conf": 0.10781954228878021, "num_tokens": 13844203.0, "reward": 0.9544321894645691, "reward_std": 0.1735970377922058, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.8717612028121948, "rewards/final_brier_reward_step": 0.7105406522750854, "rewards/format_reward_step": 0.9765625, "step": 57 }, { "adv/mean_abs_final_conf": 0.653085470199585, "adv/mean_abs_reasoning": 0.5664516687393188, "adv/mean_abs_step_conf": 0.7543550133705139, "adv/ratio_final_to_reasoning": 1.1529412061810609, "adv/ratio_step_to_reasoning": 1.331719994839786, "adv/std_final_conf": 0.8459213376045227, "adv/std_reasoning": 0.7929334044456482, "adv/std_step_conf": 0.9330338835716248, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5168343526007759, "calib/avg_num_step_conf": 6.296875, "calib/ece": 0.37165322580645166, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7903225806451613, "calib/gap": 0.026798842638258824, "calib/mean_conf": 0.8870564516129033, "calib/mu_c": 0.899051094890511, "calib/mu_w": 0.8722522522522522, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.35314516129032264, "calib/std_conf": 0.22490635335836415, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4674119076549211, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.012785798656188507, "calib/step_q_w": 0.45462610899873257, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 631.66796875, "completions/mean_terminated_length": 634.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.06186666666666667, "grad_norm": 0.0428379587829113, "kl": 0.039905548095703125, "learning_rate": 3.944444444444445e-06, "loss": -0.0225, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02726609632372856, "mask/share_reasoning": 0.8649106621742249, "mask/share_step_conf": 0.10391701012849808, "num_tokens": 14112230.0, "reward": 0.8503095507621765, "reward_std": 0.23419946432113647, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.8246171474456787, "rewards/final_brier_reward_step": 0.5775644779205322, "rewards/format_reward_step": 0.95703125, "step": 58 }, { "adv/mean_abs_final_conf": 0.6193721890449524, "adv/mean_abs_reasoning": 0.4775833189487457, "adv/mean_abs_step_conf": 0.7455660104751587, "adv/ratio_final_to_reasoning": 1.2968882380739588, "adv/ratio_step_to_reasoning": 1.561122386176082, "adv/std_final_conf": 0.8122193217277527, "adv/std_reasoning": 0.7393720746040344, "adv/std_step_conf": 0.933074951171875, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6466128807900959, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.3228112449799195, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8273092369477911, "calib/gap": 0.058149951314508286, "calib/mean_conf": 0.9030522088353414, "calib/mu_c": 0.9243037974683543, "calib/mu_w": 0.866153846153846, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2956626506024094, "calib/std_conf": 0.2181105456543611, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5174651162790698, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.03366919791172285, "calib/step_q_w": 0.4837959183673469, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 548.95703125, "completions/mean_terminated_length": 553.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.06293333333333333, "grad_norm": 0.05327894538640976, "kl": 0.057308197021484375, "learning_rate": 3.916666666666667e-06, "loss": -0.0296, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03187629580497742, "mask/share_reasoning": 0.85688316822052, "mask/share_step_conf": 0.10342804342508316, "num_tokens": 14359011.0, "reward": 0.9090771079063416, "reward_std": 0.21725571155548096, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8475908041000366, "rewards/final_brier_reward_step": 0.6533757448196411, "rewards/format_reward_step": 0.96875, "step": 59 }, { "adv/mean_abs_final_conf": 0.6204248070716858, "adv/mean_abs_reasoning": 0.4606407880783081, "adv/mean_abs_step_conf": 0.7363015413284302, "adv/ratio_final_to_reasoning": 1.346873362343707, "adv/ratio_step_to_reasoning": 1.5984288851191792, "adv/std_final_conf": 0.8465948104858398, "adv/std_reasoning": 0.7391853928565979, "adv/std_step_conf": 0.9324144124984741, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6778369905956112, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.3437254901960784, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.796078431372549, "calib/gap": 0.1016175548589342, "calib/mean_conf": 0.8845098039215686, "calib/mu_c": 0.928344827586207, "calib/mu_w": 0.8267272727272728, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3298039215686274, "calib/std_conf": 0.23504954609522094, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5443636363636363, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.06410091058366907, "calib/step_q_w": 0.4802627257799672, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 518.02734375, "completions/mean_terminated_length": 518.02734375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.064, "grad_norm": 0.04044271260499954, "kl": 0.048015594482421875, "learning_rate": 3.88888888888889e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03257352113723755, "mask/share_reasoning": 0.8600834012031555, "mask/share_step_conf": 0.10734307020902634, "num_tokens": 14600482.0, "reward": 0.9168381690979004, "reward_std": 0.17467540502548218, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.8741821050643921, "rewards/final_brier_reward_step": 0.6469941139221191, "rewards/format_reward_step": 0.99609375, "step": 60 }, { "adv/mean_abs_final_conf": 0.5063987374305725, "adv/mean_abs_reasoning": 0.3942033052444458, "adv/mean_abs_step_conf": 0.7471913695335388, "adv/ratio_final_to_reasoning": 1.284613118899534, "adv/ratio_step_to_reasoning": 1.8954467392661887, "adv/std_final_conf": 0.7494310140609741, "adv/std_reasoning": 0.6815856695175171, "adv/std_step_conf": 0.9329997897148132, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6456996148908857, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.26280632411067195, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8537549407114624, "calib/gap": 0.10839751818570809, "calib/mean_conf": 0.9224110671936759, "calib/mu_c": 0.9575438596491228, "calib/mu_w": 0.8491463414634147, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2546640316205534, "calib/std_conf": 0.18691619784239447, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5392926829268293, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.07070861213036911, "calib/step_q_w": 0.4685840707964602, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 446.3984375, "completions/mean_terminated_length": 446.3984375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.039539139717817307, "kl": 0.0517120361328125, "learning_rate": 3.861111111111112e-06, "loss": -0.0068, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038807108998298645, "mask/share_reasoning": 0.8420302867889404, "mask/share_step_conf": 0.11916261911392212, "num_tokens": 14818824.0, "reward": 0.9512588381767273, "reward_std": 0.16452768445014954, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8505984544754028, "rewards/final_brier_reward_step": 0.7214503288269043, "rewards/format_reward_step": 0.984375, "step": 61 }, { "adv/mean_abs_final_conf": 0.7046828269958496, "adv/mean_abs_reasoning": 0.6048574447631836, "adv/mean_abs_step_conf": 0.782432496547699, "adv/ratio_final_to_reasoning": 1.165039519802472, "adv/ratio_step_to_reasoning": 1.293581658491515, "adv/std_final_conf": 0.8792684078216553, "adv/std_reasoning": 0.8099877238273621, "adv/std_step_conf": 0.9344117045402527, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.590156823490157, "calib/avg_num_step_conf": 5.34375, "calib/ece": 0.3766260162601627, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7479674796747967, "calib/gap": 0.04934334334334367, "calib/mean_conf": 0.8574390243902439, "calib/mu_c": 0.8797037037037038, "calib/mu_w": 0.8303603603603601, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3426422764227643, "calib/std_conf": 0.25203470035773656, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5694727592267135, "calib/step_q_c_n": 569.0, "calib/step_q_gap": 0.0766567392016822, "calib/step_q_w": 0.4928160200250313, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 535.71875, "completions/mean_terminated_length": 539.93701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.06613333333333334, "grad_norm": 0.05797210708260536, "kl": 0.049541473388671875, "learning_rate": 3.833333333333334e-06, "loss": -0.0318, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03127940744161606, "mask/share_reasoning": 0.858956515789032, "mask/share_step_conf": 0.10195156186819077, "num_tokens": 15063048.0, "reward": 0.8401246070861816, "reward_std": 0.255887508392334, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.7979668378829956, "rewards/final_brier_reward_step": 0.5869699120521545, "rewards/format_reward_step": 0.94921875, "step": 62 }, { "adv/mean_abs_final_conf": 0.7219770550727844, "adv/mean_abs_reasoning": 0.5437690019607544, "adv/mean_abs_step_conf": 0.7378803491592407, "adv/ratio_final_to_reasoning": 1.3277274954428018, "adv/ratio_step_to_reasoning": 1.3569739107940102, "adv/std_final_conf": 0.8784008622169495, "adv/std_reasoning": 0.7928453683853149, "adv/std_step_conf": 0.9337167739868164, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7164169119614665, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.2565194109772422, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6706827309236948, "calib/gap": 0.1969813129961645, "calib/mean_conf": 0.821338688085676, "calib/mu_c": 0.9012387387387387, "calib/mu_w": 0.7042574257425742, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2417402945113787, "calib/std_conf": 0.25969706026981004, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5517697841726619, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.06320769266939391, "calib/step_q_w": 0.48856209150326796, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 574.24609375, "completions/mean_terminated_length": 576.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0672, "grad_norm": 0.043521128594875336, "kl": 0.037807464599609375, "learning_rate": 3.8055555555555556e-06, "loss": -0.0087, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031664442270994186, "mask/share_reasoning": 0.8629859089851379, "mask/share_step_conf": 0.10144336521625519, "num_tokens": 15318695.0, "reward": 0.932765007019043, "reward_std": 0.20682235062122345, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8452221155166626, "rewards/final_brier_reward_step": 0.7109330296516418, "rewards/format_reward_step": 0.96875, "step": 63 }, { "adv/mean_abs_final_conf": 0.6922527551651001, "adv/mean_abs_reasoning": 0.5614722967147827, "adv/mean_abs_step_conf": 0.7511861324310303, "adv/ratio_final_to_reasoning": 1.2329241517622933, "adv/ratio_step_to_reasoning": 1.3378863691517422, "adv/std_final_conf": 0.8831756711006165, "adv/std_reasoning": 0.7928008437156677, "adv/std_step_conf": 0.9331434369087219, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6446036498431708, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.1753386454183265, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6254980079681275, "calib/gap": 0.18141930424864539, "calib/mean_conf": 0.7977290836653387, "calib/mu_c": 0.858443113772455, "calib/mu_w": 0.6770238095238096, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15386454183266915, "calib/std_conf": 0.2702736224399446, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5586697782963826, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.035542024992417875, "calib/step_q_w": 0.5231277533039648, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 513.83984375, "completions/mean_terminated_length": 515.8549194335938, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.06826666666666667, "grad_norm": 0.15224145352840424, "kl": 0.09376144409179688, "learning_rate": 3.777777777777778e-06, "loss": 0.0487, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032815635204315186, "mask/share_reasoning": 0.8548951148986816, "mask/share_step_conf": 0.10838305950164795, "num_tokens": 15554014.0, "reward": 0.9618469476699829, "reward_std": 0.19402143359184265, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.8472946882247925, "rewards/final_brier_reward_step": 0.7498366832733154, "rewards/format_reward_step": 0.9765625, "step": 64 }, { "adv/mean_abs_final_conf": 0.6335813403129578, "adv/mean_abs_reasoning": 0.36810600757598877, "adv/mean_abs_step_conf": 0.7440919876098633, "adv/ratio_final_to_reasoning": 1.7211926110229714, "adv/ratio_step_to_reasoning": 2.021406802105122, "adv/std_final_conf": 0.8394150733947754, "adv/std_reasoning": 0.6611788272857666, "adv/std_step_conf": 0.933049201965332, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7085637993515164, "calib/avg_num_step_conf": 4.49609375, "calib/ece": 0.28799212598425206, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7086614173228346, "calib/gap": 0.13941445737173375, "calib/mean_conf": 0.8620866141732284, "calib/mu_c": 0.9208163265306123, "calib/mu_w": 0.7814018691588785, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.28566929133858276, "calib/std_conf": 0.2172899845365054, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6184561891515994, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.007021003966414252, "calib/step_q_w": 0.6114351851851851, "calib/step_q_w_n": 432.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 425.75, "completions/mean_terminated_length": 427.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.06933333333333333, "grad_norm": 0.05041688680648804, "kl": 0.04650115966796875, "learning_rate": 3.7500000000000005e-06, "loss": -0.044, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.038847897201776505, "mask/share_reasoning": 0.8437234163284302, "mask/share_step_conf": 0.1135224848985672, "num_tokens": 15768030.0, "reward": 0.9121578335762024, "reward_std": 0.1568649709224701, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.8197988271713257, "rewards/final_brier_reward_step": 0.6912355422973633, "rewards/format_reward_step": 0.9921875, "step": 65 }, { "adv/mean_abs_final_conf": 0.699600338935852, "adv/mean_abs_reasoning": 0.516379177570343, "adv/mean_abs_step_conf": 0.7368757724761963, "adv/ratio_final_to_reasoning": 1.3548190347790505, "adv/ratio_step_to_reasoning": 1.4270052017653567, "adv/std_final_conf": 0.8724037408828735, "adv/std_reasoning": 0.7927989959716797, "adv/std_step_conf": 0.9339161515235901, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6934820904286553, "calib/avg_num_step_conf": 5.796875, "calib/ece": 0.20552419354838708, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4475806451612903, "calib/gap": 0.17645005545768921, "calib/mean_conf": 0.7271370967741936, "calib/mu_c": 0.810381679389313, "calib/mu_w": 0.6339316239316238, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20221774193548386, "calib/std_conf": 0.26080417924978105, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6141432791728212, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.06765505116786441, "calib/step_q_w": 0.5464882280049568, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 548.875, "completions/mean_terminated_length": 548.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0704, "grad_norm": 0.03213903680443764, "kl": 0.04193115234375, "learning_rate": 3.7222222222222225e-06, "loss": 0.1051, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03228248655796051, "mask/share_reasoning": 0.8526995182037354, "mask/share_step_conf": 0.11501805484294891, "num_tokens": 16014894.0, "reward": 0.9093428254127502, "reward_std": 0.187605082988739, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.8142845034599304, "rewards/final_brier_reward_step": 0.7083073854446411, "rewards/format_reward_step": 0.96875, "step": 66 }, { "adv/mean_abs_final_conf": 0.6824854612350464, "adv/mean_abs_reasoning": 0.29309481382369995, "adv/mean_abs_step_conf": 0.7393745183944702, "adv/ratio_final_to_reasoning": 2.328548404973039, "adv/ratio_step_to_reasoning": 2.5226462002129213, "adv/std_final_conf": 0.870021402835846, "adv/std_reasoning": 0.6184077858924866, "adv/std_step_conf": 0.9328907132148743, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8083670715249662, "calib/avg_num_step_conf": 4.8515625, "calib/ece": 0.13792828685258968, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.47410358565737054, "calib/gap": 0.27966531713900145, "calib/mean_conf": 0.7428685258964144, "calib/mu_c": 0.8487179487179489, "calib/mu_w": 0.5690526315789475, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12964143426294825, "calib/std_conf": 0.2607859302946761, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6446345177664974, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.061043768867818926, "calib/step_q_w": 0.5835907488986785, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 531.15625, "completions/mean_terminated_length": 531.15625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.07146666666666666, "grad_norm": 0.045603763312101364, "kl": 0.057559967041015625, "learning_rate": 3.694444444444445e-06, "loss": 0.0319, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03274885565042496, "mask/share_reasoning": 0.8626049160957336, "mask/share_step_conf": 0.10464620590209961, "num_tokens": 16255878.0, "reward": 0.9663029909133911, "reward_std": 0.13919922709465027, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.8169246912002563, "rewards/final_brier_reward_step": 0.7977124452590942, "rewards/format_reward_step": 0.98046875, "step": 67 }, { "adv/mean_abs_final_conf": 0.729244589805603, "adv/mean_abs_reasoning": 0.5466316938400269, "adv/mean_abs_step_conf": 0.739362895488739, "adv/ratio_final_to_reasoning": 1.3340693524057137, "adv/ratio_step_to_reasoning": 1.3525796323568378, "adv/std_final_conf": 0.910574197769165, "adv/std_reasoning": 0.7928569316864014, "adv/std_step_conf": 0.9348320960998535, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6934589041095891, "calib/avg_num_step_conf": 5.1171875, "calib/ece": 0.14150406504065038, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4024390243902439, "calib/gap": 0.18019315068493158, "calib/mean_conf": 0.7252439024390244, "calib/mu_c": 0.7984931506849315, "calib/mu_w": 0.6183, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.13662601626016257, "calib/std_conf": 0.2594485659967677, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6530057803468208, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.07198021400377874, "calib/step_q_w": 0.581025566343042, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 501.19140625, "completions/mean_terminated_length": 503.1568908691406, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.07253333333333334, "grad_norm": 0.07743779569864273, "kl": 0.046253204345703125, "learning_rate": 3.6666666666666666e-06, "loss": 0.0019, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03694310039281845, "mask/share_reasoning": 0.8424147963523865, "mask/share_step_conf": 0.11673584580421448, "num_tokens": 16488271.0, "reward": 0.9158475399017334, "reward_std": 0.21296223998069763, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.8014019727706909, "rewards/final_brier_reward_step": 0.7256054878234863, "rewards/format_reward_step": 0.953125, "step": 68 }, { "adv/mean_abs_final_conf": 0.7655402421951294, "adv/mean_abs_reasoning": 0.5517368316650391, "adv/mean_abs_step_conf": 0.7545615434646606, "adv/ratio_final_to_reasoning": 1.3875097659963562, "adv/ratio_step_to_reasoning": 1.3676113323584622, "adv/std_final_conf": 0.9263461232185364, "adv/std_reasoning": 0.792895495891571, "adv/std_step_conf": 0.9345738887786865, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6088992974238876, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.19758064516129042, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.2862903225806452, "calib/gap": 0.10129065833983864, "calib/mean_conf": 0.6591935483870969, "calib/mu_c": 0.7106557377049181, "calib/mu_w": 0.6093650793650794, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18241935483870975, "calib/std_conf": 0.24994224098648157, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6377577933450087, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.03260541945776829, "calib/step_q_w": 0.6051523738872404, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 565.609375, "completions/mean_terminated_length": 567.8275146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0736, "grad_norm": 0.04150233417749405, "kl": 0.0474853515625, "learning_rate": 3.638888888888889e-06, "loss": -0.0577, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03185002878308296, "mask/share_reasoning": 0.8687409162521362, "mask/share_step_conf": 0.09550271928310394, "num_tokens": 16737563.0, "reward": 0.8744354844093323, "reward_std": 0.19976764917373657, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.7783041596412659, "rewards/final_brier_reward_step": 0.6830667853355408, "rewards/format_reward_step": 0.9609375, "step": 69 }, { "adv/mean_abs_final_conf": 0.7534902691841125, "adv/mean_abs_reasoning": 0.4766578674316406, "adv/mean_abs_step_conf": 0.7700801491737366, "adv/ratio_final_to_reasoning": 1.5807779975270702, "adv/ratio_step_to_reasoning": 1.6155825840516034, "adv/std_final_conf": 0.9181990027427673, "adv/std_reasoning": 0.720661461353302, "adv/std_step_conf": 0.9350027441978455, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8443896507464703, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.20327868852459027, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.3524590163934426, "calib/gap": 0.3460663379044788, "calib/mean_conf": 0.6605737704918033, "calib/mu_c": 0.8463716814159292, "calib/mu_w": 0.5003053435114504, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2003688524590165, "calib/std_conf": 0.28365674008783714, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6533443708609271, "calib/step_q_c_n": 604.0, "calib/step_q_gap": 0.07408964931129025, "calib/step_q_w": 0.5792547215496369, "calib/step_q_w_n": 826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 599.86328125, "completions/mean_terminated_length": 599.86328125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.07466666666666667, "grad_norm": 0.04824332147836685, "kl": 0.04335784912109375, "learning_rate": 3.6111111111111115e-06, "loss": 0.0352, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.033216990530490875, "mask/share_reasoning": 0.8483107686042786, "mask/share_step_conf": 0.11847224086523056, "num_tokens": 16998120.0, "reward": 0.8946191072463989, "reward_std": 0.18673905730247498, "rewards/accuracy_reward_step": 0.44140625, "rewards/asymmetric_l2_reward": 0.7578139305114746, "rewards/final_brier_reward_step": 0.7556430101394653, "rewards/format_reward_step": 0.9375, "step": 70 }, { "adv/mean_abs_final_conf": 0.7033551931381226, "adv/mean_abs_reasoning": 0.5371626615524292, "adv/mean_abs_step_conf": 0.746070146560669, "adv/ratio_final_to_reasoning": 1.3093895824877104, "adv/ratio_step_to_reasoning": 1.3889091702771854, "adv/std_final_conf": 0.8913128972053528, "adv/std_reasoning": 0.7754039168357849, "adv/std_step_conf": 0.9348368048667908, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6245294234198534, "calib/avg_num_step_conf": 5.86328125, "calib/ece": 0.22508000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.472, "calib/gap": 0.11424278449243785, "calib/mean_conf": 0.723, "calib/mu_c": 0.7700680272108843, "calib/mu_w": 0.6558252427184464, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18004000000000003, "calib/std_conf": 0.28955034104625055, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6275355670103093, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.03587267045858522, "calib/step_q_w": 0.5916628965517241, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 533.4921875, "completions/mean_terminated_length": 535.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.07573333333333333, "grad_norm": 0.036347754299640656, "kl": 0.049762725830078125, "learning_rate": 3.5833333333333335e-06, "loss": -0.0425, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0346219427883625, "mask/share_reasoning": 0.8352914452552795, "mask/share_step_conf": 0.12618035078048706, "num_tokens": 17239102.0, "reward": 0.8964895009994507, "reward_std": 0.19243742525577545, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.7922806143760681, "rewards/final_brier_reward_step": 0.691323459148407, "rewards/format_reward_step": 0.97265625, "step": 71 }, { "adv/mean_abs_final_conf": 0.6750290393829346, "adv/mean_abs_reasoning": 0.45832559466362, "adv/mean_abs_step_conf": 0.7397451400756836, "adv/ratio_final_to_reasoning": 1.4728154989431919, "adv/ratio_step_to_reasoning": 1.614016648183496, "adv/std_final_conf": 0.8813406229019165, "adv/std_reasoning": 0.7574042081832886, "adv/std_step_conf": 0.935146689414978, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7351403061224491, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.21115079365079364, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.47619047619047616, "calib/gap": 0.22401785714285705, "calib/mean_conf": 0.7532936507936507, "calib/mu_c": 0.8528571428571429, "calib/mu_w": 0.6288392857142858, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.20444444444444443, "calib/std_conf": 0.2702509582515445, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.647175965665236, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.05842441173470592, "calib/step_q_w": 0.5887515539305301, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 483.91015625, "completions/mean_terminated_length": 483.91015625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.0768, "grad_norm": 0.31355804204940796, "kl": 0.11542510986328125, "learning_rate": 3.555555555555556e-06, "loss": 0.0661, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034122809767723083, "mask/share_reasoning": 0.8555728197097778, "mask/share_step_conf": 0.11030436307191849, "num_tokens": 17467391.0, "reward": 0.9290406107902527, "reward_std": 0.1833093762397766, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.8128567934036255, "rewards/final_brier_reward_step": 0.7397554516792297, "rewards/format_reward_step": 0.98046875, "step": 72 }, { "adv/mean_abs_final_conf": 0.6979169249534607, "adv/mean_abs_reasoning": 0.611485481262207, "adv/mean_abs_step_conf": 0.7284319400787354, "adv/ratio_final_to_reasoning": 1.1413466817116327, "adv/ratio_step_to_reasoning": 1.191249771908127, "adv/std_final_conf": 0.8787330389022827, "adv/std_reasoning": 0.8267027735710144, "adv/std_step_conf": 0.9347501993179321, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.765242718446602, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.2145454545454545, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5889328063241107, "calib/gap": 0.232482200647249, "calib/mean_conf": 0.7950197628458499, "calib/mu_c": 0.8896666666666666, "calib/mu_w": 0.6571844660194176, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.20833992094861656, "calib/std_conf": 0.2677967852430365, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6364171390013496, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.059897394942848625, "calib/step_q_w": 0.5765197440585009, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 468.234375, "completions/mean_terminated_length": 468.234375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07786666666666667, "grad_norm": 0.027293583378195763, "kl": 0.050006866455078125, "learning_rate": 3.5277777777777784e-06, "loss": -0.0106, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0346967875957489, "mask/share_reasoning": 0.8508319854736328, "mask/share_step_conf": 0.11447125673294067, "num_tokens": 17694291.0, "reward": 0.9352380037307739, "reward_std": 0.20372015237808228, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8135044574737549, "rewards/final_brier_reward_step": 0.7436902523040771, "rewards/format_reward_step": 0.98046875, "step": 73 }, { "adv/mean_abs_final_conf": 0.6834238767623901, "adv/mean_abs_reasoning": 0.4758460521697998, "adv/mean_abs_step_conf": 0.7555922269821167, "adv/ratio_final_to_reasoning": 1.4362289518764753, "adv/ratio_step_to_reasoning": 1.587892183904245, "adv/std_final_conf": 0.883694052696228, "adv/std_reasoning": 0.7206491827964783, "adv/std_step_conf": 0.92009037733078, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.741551724137931, "calib/avg_num_step_conf": 4.6953125, "calib/ece": 0.22875518672199174, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4730290456431535, "calib/gap": 0.2606606896551724, "calib/mean_conf": 0.7315767634854772, "calib/mu_c": 0.85704, "calib/mu_w": 0.5963793103448276, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.22082987551867225, "calib/std_conf": 0.3010369411849477, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.6306426644182125, "calib/step_q_c_n": 593.0, "calib/step_q_gap": 0.06642263157748995, "calib/step_q_w": 0.5642200328407225, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 521.0859375, "completions/mean_terminated_length": 523.1294555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.07893333333333333, "grad_norm": 0.0466296449303627, "kl": 0.0598602294921875, "learning_rate": 3.5e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.0345626175403595, "mask/share_reasoning": 0.8509548306465149, "mask/share_step_conf": 0.1105763390660286, "num_tokens": 17931617.0, "reward": 0.8789986968040466, "reward_std": 0.19282305240631104, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.7778134346008301, "rewards/final_brier_reward_step": 0.6950277090072632, "rewards/format_reward_step": 0.93359375, "step": 74 }, { "adv/mean_abs_final_conf": 0.628893256187439, "adv/mean_abs_reasoning": 0.4681423604488373, "adv/mean_abs_step_conf": 0.7405921816825867, "adv/ratio_final_to_reasoning": 1.3433803674260107, "adv/ratio_step_to_reasoning": 1.581980705554043, "adv/std_final_conf": 0.8261920809745789, "adv/std_reasoning": 0.7575705051422119, "adv/std_step_conf": 0.9339452981948853, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8038342655284075, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.18517928286852595, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7330677290836654, "calib/gap": 0.26919782160730044, "calib/mean_conf": 0.8500398406374503, "calib/mu_c": 0.934767441860465, "calib/mu_w": 0.6655696202531646, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.17498007968127494, "calib/std_conf": 0.2661478156350438, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6109479191438763, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.051716654227408454, "calib/step_q_w": 0.5592312649164678, "calib/step_q_w_n": 419.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 469.1328125, "completions/mean_terminated_length": 469.1328125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.08, "grad_norm": 0.02828267775475979, "kl": 0.0692291259765625, "learning_rate": 3.4722222222222224e-06, "loss": -0.0302, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035453006625175476, "mask/share_reasoning": 0.8479112386703491, "mask/share_step_conf": 0.1166357472538948, "num_tokens": 18156467.0, "reward": 0.9559888243675232, "reward_std": 0.22168438136577606, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8216685652732849, "rewards/final_brier_reward_step": 0.7645277380943298, "rewards/format_reward_step": 0.95703125, "step": 75 }, { "adv/mean_abs_final_conf": 0.622812032699585, "adv/mean_abs_reasoning": 0.5359321236610413, "adv/mean_abs_step_conf": 0.7628259062767029, "adv/ratio_final_to_reasoning": 1.1621099113168523, "adv/ratio_step_to_reasoning": 1.4233629084700363, "adv/std_final_conf": 0.8540507555007935, "adv/std_reasoning": 0.7928803563117981, "adv/std_step_conf": 0.9348656535148621, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7480906148867315, "calib/avg_num_step_conf": 4.6171875, "calib/ece": 0.25411067193675896, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6956521739130435, "calib/gap": 0.22156634304207123, "calib/mean_conf": 0.8291304347826086, "calib/mu_c": 0.9193333333333332, "calib/mu_w": 0.697766990291262, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.24517786561264823, "calib/std_conf": 0.2779797863926332, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.595253709198813, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.01054898478936428, "calib/step_q_w": 0.5847047244094488, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 486.13671875, "completions/mean_terminated_length": 488.04315185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.08106666666666666, "grad_norm": 0.043174050748348236, "kl": 0.0677642822265625, "learning_rate": 3.444444444444445e-06, "loss": -0.0746, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03488195687532425, "mask/share_reasoning": 0.8566423058509827, "mask/share_step_conf": 0.10456950962543488, "num_tokens": 18383974.0, "reward": 0.9037140607833862, "reward_std": 0.22321432828903198, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7896616458892822, "rewards/final_brier_reward_step": 0.7076101303100586, "rewards/format_reward_step": 0.96484375, "step": 76 }, { "adv/mean_abs_final_conf": 0.6962723731994629, "adv/mean_abs_reasoning": 0.5586796998977661, "adv/mean_abs_step_conf": 0.7595263123512268, "adv/ratio_final_to_reasoning": 1.2462818558234263, "adv/ratio_step_to_reasoning": 1.359502255926274, "adv/std_final_conf": 0.863667905330658, "adv/std_reasoning": 0.7929328680038452, "adv/std_step_conf": 0.9350054860115051, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6630559540889527, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.22903614457831326, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6305220883534136, "calib/gap": 0.147654949784792, "calib/mean_conf": 0.8014859437751004, "calib/mu_c": 0.851890243902439, "calib/mu_w": 0.704235294117647, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.1859437751004016, "calib/std_conf": 0.29090324783614796, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5586034255599472, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.03651251646903819, "calib/step_q_w": 0.522090909090909, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 512.046875, "completions/mean_terminated_length": 512.046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.08213333333333334, "grad_norm": 0.025604577735066414, "kl": 0.06995391845703125, "learning_rate": 3.416666666666667e-06, "loss": -0.0046, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03763299062848091, "mask/share_reasoning": 0.8435276746749878, "mask/share_step_conf": 0.11883929371833801, "num_tokens": 18619722.0, "reward": 0.9079852104187012, "reward_std": 0.24078664183616638, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.8027098774909973, "rewards/final_brier_reward_step": 0.696854293346405, "rewards/format_reward_step": 0.94140625, "step": 77 }, { "adv/mean_abs_final_conf": 0.7457473278045654, "adv/mean_abs_reasoning": 0.6220015287399292, "adv/mean_abs_step_conf": 0.7352344989776611, "adv/ratio_final_to_reasoning": 1.1989477410374287, "adv/ratio_step_to_reasoning": 1.1820461285153478, "adv/std_final_conf": 0.9125654101371765, "adv/std_reasoning": 0.8429985642433167, "adv/std_step_conf": 0.9346243143081665, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6882824726134585, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.2450129333333334, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.572, "calib/gap": 0.196561737089202, "calib/mean_conf": 0.7558137333333333, "calib/mu_c": 0.8407284037558685, "calib/mu_w": 0.6441666666666666, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.2164133333333334, "calib/std_conf": 0.3190380118248817, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5449234693877552, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.02516442176870748, "calib/step_q_w": 0.5197590476190477, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 531.875, "completions/mean_terminated_length": 533.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.0832, "grad_norm": 0.02739427238702774, "kl": 0.0735321044921875, "learning_rate": 3.3888888888888893e-06, "loss": -0.1401, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03082401677966118, "mask/share_reasoning": 0.8617717027664185, "mask/share_step_conf": 0.10349804162979126, "num_tokens": 18863906.0, "reward": 0.8943131566047668, "reward_std": 0.24572047591209412, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8101315498352051, "rewards/final_brier_reward_step": 0.6777135133743286, "rewards/format_reward_step": 0.94921875, "step": 78 }, { "adv/mean_abs_final_conf": 0.6829380989074707, "adv/mean_abs_reasoning": 0.46346795558929443, "adv/mean_abs_step_conf": 0.7522290945053101, "adv/ratio_final_to_reasoning": 1.47353898078913, "adv/ratio_step_to_reasoning": 1.6230444530924668, "adv/std_final_conf": 0.8693846464157104, "adv/std_reasoning": 0.7205665707588196, "adv/std_step_conf": 0.9347177147865295, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6895833333333334, "calib/avg_num_step_conf": 5.640625, "calib/ece": 0.24385826771653552, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6417322834645669, "calib/gap": 0.20205384615384603, "calib/mean_conf": 0.7956692913385828, "calib/mu_c": 0.8784, "calib/mu_w": 0.6763461538461539, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22448818897637804, "calib/std_conf": 0.3087352675593296, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5525216316440049, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.06615942691959542, "calib/step_q_w": 0.4863622047244095, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 510.8671875, "completions/mean_terminated_length": 514.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.08426666666666667, "grad_norm": 0.031064137816429138, "kl": 0.07048797607421875, "learning_rate": 3.3611111111111117e-06, "loss": -0.0686, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033156618475914, "mask/share_reasoning": 0.8510840535163879, "mask/share_step_conf": 0.10794685781002045, "num_tokens": 19101064.0, "reward": 0.9335497617721558, "reward_std": 0.19944220781326294, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.847852885723114, "rewards/final_brier_reward_step": 0.7051839828491211, "rewards/format_reward_step": 0.984375, "step": 79 }, { "adv/mean_abs_final_conf": 0.6879172325134277, "adv/mean_abs_reasoning": 0.5587252378463745, "adv/mean_abs_step_conf": 0.7600916028022766, "adv/ratio_final_to_reasoning": 1.231226345108426, "adv/ratio_step_to_reasoning": 1.3604032023541224, "adv/std_final_conf": 0.8734791278839111, "adv/std_reasoning": 0.7755234241485596, "adv/std_step_conf": 0.9350786805152893, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7002287581699347, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.2808827404479579, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.782608695652174, "calib/gap": 0.18351067538126353, "calib/mean_conf": 0.876376811594203, "calib/mu_c": 0.9489106753812636, "calib/mu_w": 0.7654000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.27625823451910414, "calib/std_conf": 0.2520777155119635, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5498933962264151, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.07029712293448959, "calib/step_q_w": 0.4795962732919255, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 464.578125, "completions/mean_terminated_length": 466.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.08533333333333333, "grad_norm": 0.025256939232349396, "kl": 0.0876312255859375, "learning_rate": 3.3333333333333333e-06, "loss": -0.0673, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03492242097854614, "mask/share_reasoning": 0.8304387331008911, "mask/share_step_conf": 0.13073261082172394, "num_tokens": 19322156.0, "reward": 0.9114360809326172, "reward_std": 0.23360256850719452, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.8124538660049438, "rewards/final_brier_reward_step": 0.6955744028091431, "rewards/format_reward_step": 0.9765625, "step": 80 }, { "adv/mean_abs_final_conf": 0.7039898633956909, "adv/mean_abs_reasoning": 0.5247969627380371, "adv/mean_abs_step_conf": 0.7595534324645996, "adv/ratio_final_to_reasoning": 1.341451863064805, "adv/ratio_step_to_reasoning": 1.4473281790766497, "adv/std_final_conf": 0.8915676474571228, "adv/std_reasoning": 0.7928330898284912, "adv/std_step_conf": 0.9346292018890381, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6952564809707666, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.2584027100271004, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.6422764227642277, "calib/gap": 0.22656881779738924, "calib/mean_conf": 0.7620579945799457, "calib/mu_c": 0.8523171171171171, "calib/mu_w": 0.6257482993197279, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.20941734417344188, "calib/std_conf": 0.3406476325655169, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5421875, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.10437039855072472, "calib/step_q_w": 0.4378171014492753, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 529.484375, "completions/mean_terminated_length": 529.484375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0864, "grad_norm": 0.03870345279574394, "kl": 0.07418060302734375, "learning_rate": 3.3055555555555558e-06, "loss": 0.0557, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.035466842353343964, "mask/share_reasoning": 0.8551498055458069, "mask/share_step_conf": 0.10938338935375214, "num_tokens": 19563952.0, "reward": 0.8947978019714355, "reward_std": 0.23964188992977142, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8084914684295654, "rewards/final_brier_reward_step": 0.6787604093551636, "rewards/format_reward_step": 0.93359375, "step": 81 }, { "adv/mean_abs_final_conf": 0.6892867088317871, "adv/mean_abs_reasoning": 0.49717363715171814, "adv/mean_abs_step_conf": 0.7668944597244263, "adv/ratio_final_to_reasoning": 1.3864104154449435, "adv/ratio_step_to_reasoning": 1.5425082957292842, "adv/std_final_conf": 0.8581146001815796, "adv/std_reasoning": 0.7393258810043335, "adv/std_step_conf": 0.9347033500671387, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6737952575216726, "calib/avg_num_step_conf": 5.015625, "calib/ece": 0.2896194225721785, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7401574803149606, "calib/gap": 0.1977885432602413, "calib/mean_conf": 0.8453412073490814, "calib/mu_c": 0.9278828828828828, "calib/mu_w": 0.7300943396226415, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2761417322834646, "calib/std_conf": 0.2911954452589784, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5398409090909091, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.06357194357366774, "calib/step_q_w": 0.47626896551724135, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 454.90234375, "completions/mean_terminated_length": 454.90234375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.08746666666666666, "grad_norm": 0.02759123221039772, "kl": 0.09171295166015625, "learning_rate": 3.277777777777778e-06, "loss": 0.0137, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037247899919748306, "mask/share_reasoning": 0.8433917760848999, "mask/share_step_conf": 0.11936035752296448, "num_tokens": 19785959.0, "reward": 0.908469557762146, "reward_std": 0.21961763501167297, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8281220197677612, "rewards/final_brier_reward_step": 0.6786607503890991, "rewards/format_reward_step": 0.97265625, "step": 82 }, { "adv/mean_abs_final_conf": 0.6679799556732178, "adv/mean_abs_reasoning": 0.5168710350990295, "adv/mean_abs_step_conf": 0.7405879497528076, "adv/ratio_final_to_reasoning": 1.2923532376799498, "adv/ratio_step_to_reasoning": 1.4328292735748196, "adv/std_final_conf": 0.8723222613334656, "adv/std_reasoning": 0.7929161190986633, "adv/std_step_conf": 0.9351591467857361, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7158919511860689, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.2495102040816326, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.6775510204081633, "calib/gap": 0.2795797339914986, "calib/mean_conf": 0.7936734693877551, "calib/mu_c": 0.91006993006993, "calib/mu_w": 0.6304901960784314, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.22975510204081628, "calib/std_conf": 0.3341653866316157, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.48273054054054054, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.04936725929054059, "calib/step_q_w": 0.43336328124999995, "calib/step_q_w_n": 640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 554.6796875, "completions/mean_terminated_length": 556.8549194335938, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.08853333333333334, "grad_norm": 0.03267595171928406, "kl": 0.07877349853515625, "learning_rate": 3.2500000000000002e-06, "loss": -0.0906, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03222406283020973, "mask/share_reasoning": 0.8548700213432312, "mask/share_step_conf": 0.10899969935417175, "num_tokens": 20035221.0, "reward": 0.8798561096191406, "reward_std": 0.2699333727359772, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.7940636277198792, "rewards/final_brier_reward_step": 0.6711171865463257, "rewards/format_reward_step": 0.91015625, "step": 83 }, { "adv/mean_abs_final_conf": 0.711301326751709, "adv/mean_abs_reasoning": 0.5254456996917725, "adv/mean_abs_step_conf": 0.7322558164596558, "adv/ratio_final_to_reasoning": 1.3537104351010196, "adv/ratio_step_to_reasoning": 1.393589893093042, "adv/std_final_conf": 0.8830074071884155, "adv/std_reasoning": 0.792775571346283, "adv/std_step_conf": 0.934762716293335, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7625246548323472, "calib/avg_num_step_conf": 4.6953125, "calib/ece": 0.29368421052631577, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.680161943319838, "calib/gap": 0.274820512820513, "calib/mean_conf": 0.785668016194332, "calib/mu_c": 0.9158461538461539, "calib/mu_w": 0.6410256410256409, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.27651821862348175, "calib/std_conf": 0.3377123849800877, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.49211437403400315, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.05764590556553467, "calib/step_q_w": 0.4344684684684685, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 498.296875, "completions/mean_terminated_length": 498.296875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0896, "grad_norm": 0.041336141526699066, "kl": 0.08129119873046875, "learning_rate": 3.2222222222222227e-06, "loss": -0.0067, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.036794595420360565, "mask/share_reasoning": 0.8531485795974731, "mask/share_step_conf": 0.11005677282810211, "num_tokens": 20268705.0, "reward": 0.8911169767379761, "reward_std": 0.24814923107624054, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.8201553225517273, "rewards/final_brier_reward_step": 0.6706722974777222, "rewards/format_reward_step": 0.94921875, "step": 84 }, { "adv/mean_abs_final_conf": 0.7375852465629578, "adv/mean_abs_reasoning": 0.6083944439888, "adv/mean_abs_step_conf": 0.7445007562637329, "adv/ratio_final_to_reasoning": 1.212347111073447, "adv/ratio_step_to_reasoning": 1.2237139303616626, "adv/std_final_conf": 0.8906832933425903, "adv/std_reasoning": 0.8099904656410217, "adv/std_step_conf": 0.9348547458648682, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7101584022038567, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.31475795297372056, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6473029045643154, "calib/gap": 0.2235241046831956, "calib/mean_conf": 0.7624757952973721, "calib/mu_c": 0.8737741046831956, "calib/mu_w": 0.65025, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.2875795297372061, "calib/std_conf": 0.3512132715214306, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.46668918918918917, "calib/step_q_c_n": 592.0, "calib/step_q_gap": 0.060110920312577754, "calib/step_q_w": 0.4065782688766114, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 548.95703125, "completions/mean_terminated_length": 553.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.09066666666666667, "grad_norm": 0.03381429985165596, "kl": 0.07662200927734375, "learning_rate": 3.1944444444444443e-06, "loss": -0.0631, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.033809252083301544, "mask/share_reasoning": 0.844788670539856, "mask/share_step_conf": 0.11358959972858429, "num_tokens": 20517062.0, "reward": 0.848590612411499, "reward_std": 0.26348742842674255, "rewards/accuracy_reward_step": 0.47265625, "rewards/asymmetric_l2_reward": 0.8029472827911377, "rewards/final_brier_reward_step": 0.6153277158737183, "rewards/format_reward_step": 0.921875, "step": 85 }, { "adv/mean_abs_final_conf": 0.7561108469963074, "adv/mean_abs_reasoning": 0.4853861927986145, "adv/mean_abs_step_conf": 0.7464814782142639, "adv/ratio_final_to_reasoning": 1.5577510407470858, "adv/ratio_step_to_reasoning": 1.5379124690594097, "adv/std_final_conf": 0.9270208477973938, "adv/std_reasoning": 0.7754190564155579, "adv/std_step_conf": 0.9350854158401489, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6685927067283, "calib/avg_num_step_conf": 4.6015625, "calib/ece": 0.31136, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.568, "calib/gap": 0.23257960965588087, "calib/mean_conf": 0.69152, "calib/mu_c": 0.814322033898305, "calib/mu_w": 0.5817424242424242, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.26544, "calib/std_conf": 0.3847857710466956, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.46849652777777784, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.058679252030269524, "calib/step_q_w": 0.4098172757475083, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 488.9609375, "completions/mean_terminated_length": 490.8784484863281, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.09173333333333333, "grad_norm": 0.044561564922332764, "kl": 0.08454132080078125, "learning_rate": 3.1666666666666667e-06, "loss": -0.0951, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03566384315490723, "mask/share_reasoning": 0.8496521711349487, "mask/share_step_conf": 0.11077769100666046, "num_tokens": 20747748.0, "reward": 0.8705247640609741, "reward_std": 0.25272929668426514, "rewards/accuracy_reward_step": 0.4609375, "rewards/asymmetric_l2_reward": 0.8211120367050171, "rewards/final_brier_reward_step": 0.6363437175750732, "rewards/format_reward_step": 0.95703125, "step": 86 }, { "adv/mean_abs_final_conf": 0.7711803913116455, "adv/mean_abs_reasoning": 0.6064971685409546, "adv/mean_abs_step_conf": 0.7583389282226562, "adv/ratio_final_to_reasoning": 1.271531725641635, "adv/ratio_step_to_reasoning": 1.2503585631685408, "adv/std_final_conf": 0.9249719381332397, "adv/std_reasoning": 0.8746480345726013, "adv/std_step_conf": 0.9346683621406555, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6656976744186047, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.2269611780455154, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5943775100401606, "calib/gap": 0.2687181616832781, "calib/mean_conf": 0.6965729585006694, "calib/mu_c": 0.779670542635659, "calib/mu_w": 0.5109523809523809, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1163855421686747, "calib/std_conf": 0.389896312445251, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4399458997722096, "calib/step_q_c_n": 878.0, "calib/step_q_gap": 0.051287529728156755, "calib/step_q_w": 0.38865837004405285, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 489.95703125, "completions/mean_terminated_length": 491.8784484863281, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.0928, "grad_norm": 0.04757794737815857, "kl": 0.0927886962890625, "learning_rate": 3.138888888888889e-06, "loss": -0.0186, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.036881767213344574, "mask/share_reasoning": 0.8420805931091309, "mask/share_step_conf": 0.11713138222694397, "num_tokens": 20978673.0, "reward": 0.947090744972229, "reward_std": 0.24253800511360168, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.8506828546524048, "rewards/final_brier_reward_step": 0.7169361114501953, "rewards/format_reward_step": 0.9609375, "step": 87 }, { "adv/mean_abs_final_conf": 0.6935921907424927, "adv/mean_abs_reasoning": 0.47695034742355347, "adv/mean_abs_step_conf": 0.7630202770233154, "adv/ratio_final_to_reasoning": 1.4542230538026035, "adv/ratio_step_to_reasoning": 1.5997897499081155, "adv/std_final_conf": 0.8939658999443054, "adv/std_reasoning": 0.7392098903656006, "adv/std_step_conf": 0.9342235922813416, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7492699596147873, "calib/avg_num_step_conf": 5.25390625, "calib/ece": 0.21371054687500007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.51171875, "calib/gap": 0.3207307300403851, "calib/mean_conf": 0.664726953125, "calib/mu_c": 0.8037937931034482, "calib/mu_w": 0.4830630630630631, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15601562500000005, "calib/std_conf": 0.3798028370295785, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4338300492610837, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.04416776033050207, "calib/step_q_w": 0.38966228893058164, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 505.84375, "completions/mean_terminated_length": 507.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.09386666666666667, "grad_norm": 0.04795219376683235, "kl": 0.08171844482421875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0578, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03347531333565712, "mask/share_reasoning": 0.8500121831893921, "mask/share_step_conf": 0.11260630190372467, "num_tokens": 21218017.0, "reward": 0.950875461101532, "reward_std": 0.1858539879322052, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.8525054454803467, "rewards/final_brier_reward_step": 0.740651547908783, "rewards/format_reward_step": 0.9765625, "step": 88 }, { "adv/mean_abs_final_conf": 0.7229398488998413, "adv/mean_abs_reasoning": 0.47888630628585815, "adv/mean_abs_step_conf": 0.743857204914093, "adv/ratio_final_to_reasoning": 1.5096273153158446, "adv/ratio_step_to_reasoning": 1.5533064845459743, "adv/std_final_conf": 0.891949474811554, "adv/std_reasoning": 0.75757896900177, "adv/std_step_conf": 0.9348465204238892, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7704152467499685, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.1756086956521739, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.43478260869565216, "calib/gap": 0.37849211157389884, "calib/mean_conf": 0.5794901185770751, "calib/mu_c": 0.7500359712230216, "calib/mu_w": 0.37154385964912273, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1028458498023715, "calib/std_conf": 0.4025525435239658, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.44734042553191494, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.09799409167856082, "calib/step_q_w": 0.3493463338533541, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 525.1953125, "completions/mean_terminated_length": 525.1953125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.09493333333333333, "grad_norm": 0.033446215093135834, "kl": 0.0828399658203125, "learning_rate": 3.0833333333333336e-06, "loss": -0.0353, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03558123856782913, "mask/share_reasoning": 0.8521634936332703, "mask/share_step_conf": 0.11225523054599762, "num_tokens": 21461355.0, "reward": 0.9454695582389832, "reward_std": 0.22363021969795227, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8446345329284668, "rewards/final_brier_reward_step": 0.7455234527587891, "rewards/format_reward_step": 0.9609375, "step": 89 }, { "adv/mean_abs_final_conf": 0.7110995054244995, "adv/mean_abs_reasoning": 0.510982096195221, "adv/mean_abs_step_conf": 0.7581682205200195, "adv/ratio_final_to_reasoning": 1.3916329177075974, "adv/ratio_step_to_reasoning": 1.4837471335401955, "adv/std_final_conf": 0.893814206123352, "adv/std_reasoning": 0.7575870156288147, "adv/std_step_conf": 0.933884859085083, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7209821428571428, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.21139442231075695, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.545816733067729, "calib/gap": 0.3062506868131869, "calib/mean_conf": 0.678406374501992, "calib/mu_c": 0.7894375, "calib/mu_w": 0.4831868131868131, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12617529880478087, "calib/std_conf": 0.3872137809409691, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4215166484118291, "calib/step_q_c_n": 913.0, "calib/step_q_gap": 0.06626083445834074, "calib/step_q_w": 0.3552558139534884, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 533.96484375, "completions/mean_terminated_length": 533.96484375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.096, "grad_norm": 0.029829688370227814, "kl": 0.0811309814453125, "learning_rate": 3.055555555555556e-06, "loss": 0.0606, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03403983637690544, "mask/share_reasoning": 0.8410810828208923, "mask/share_step_conf": 0.12487903982400894, "num_tokens": 21701370.0, "reward": 0.9619349241256714, "reward_std": 0.1955302506685257, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.8681378364562988, "rewards/final_brier_reward_step": 0.736200749874115, "rewards/format_reward_step": 0.97265625, "step": 90 }, { "adv/mean_abs_final_conf": 0.7319117784500122, "adv/mean_abs_reasoning": 0.5304526090621948, "adv/mean_abs_step_conf": 0.7541947364807129, "adv/ratio_final_to_reasoning": 1.3797873098295885, "adv/ratio_step_to_reasoning": 1.4217947533787785, "adv/std_final_conf": 0.8858980536460876, "adv/std_reasoning": 0.7927713394165039, "adv/std_step_conf": 0.9336501359939575, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7143346346140675, "calib/avg_num_step_conf": 6.0, "calib/ece": 0.23464520000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.28583001849188405, "calib/mean_conf": 0.6415948000000001, "calib/mu_c": 0.7479235668789809, "calib/mu_w": 0.4620935483870968, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12412000000000004, "calib/std_conf": 0.4015845626676404, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4017911318553092, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.06388833362261404, "calib/step_q_w": 0.33790279823269515, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 534.22265625, "completions/mean_terminated_length": 540.5573120117188, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.09706666666666666, "grad_norm": 0.04101370647549629, "kl": 0.0859375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0541, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03051559254527092, "mask/share_reasoning": 0.8473619222640991, "mask/share_step_conf": 0.11040370911359787, "num_tokens": 21945843.0, "reward": 0.9472370147705078, "reward_std": 0.20020480453968048, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8600229024887085, "rewards/final_brier_reward_step": 0.7172636389732361, "rewards/format_reward_step": 0.97265625, "step": 91 }, { "adv/mean_abs_final_conf": 0.6581138372421265, "adv/mean_abs_reasoning": 0.5041226744651794, "adv/mean_abs_step_conf": 0.7552310228347778, "adv/ratio_final_to_reasoning": 1.3054636710009428, "adv/ratio_step_to_reasoning": 1.4981096092057309, "adv/std_final_conf": 0.8489682674407959, "adv/std_reasoning": 0.7752818465232849, "adv/std_step_conf": 0.933874785900116, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7361003611971104, "calib/avg_num_step_conf": 5.03515625, "calib/ece": 0.21250656167979007, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5984251968503937, "calib/gap": 0.2983836429308565, "calib/mean_conf": 0.7195406824146982, "calib/mu_c": 0.8393640350877193, "calib/mu_w": 0.5409803921568628, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1668110236220473, "calib/std_conf": 0.36532456459468643, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4126775956284153, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.06470632094259843, "calib/step_q_w": 0.34797127468581684, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 476.046875, "completions/mean_terminated_length": 476.046875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.09813333333333334, "grad_norm": 0.030025403946638107, "kl": 0.0915374755859375, "learning_rate": 3e-06, "loss": -0.0822, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03648817166686058, "mask/share_reasoning": 0.8472324013710022, "mask/share_step_conf": 0.11627940833568573, "num_tokens": 22174431.0, "reward": 0.9668546915054321, "reward_std": 0.18710312247276306, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.870896577835083, "rewards/final_brier_reward_step": 0.7471877336502075, "rewards/format_reward_step": 0.984375, "step": 92 }, { "adv/mean_abs_final_conf": 0.6915310025215149, "adv/mean_abs_reasoning": 0.5947073698043823, "adv/mean_abs_step_conf": 0.7633007764816284, "adv/ratio_final_to_reasoning": 1.1628088663992526, "adv/ratio_step_to_reasoning": 1.28348968793291, "adv/std_final_conf": 0.8915781378746033, "adv/std_reasoning": 0.8265567421913147, "adv/std_step_conf": 0.9339955449104309, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6820295846521542, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.27851960000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.504, "calib/gap": 0.23995874297526, "calib/mean_conf": 0.6413204, "calib/mu_c": 0.7497817518248175, "calib/mu_w": 0.5098230088495576, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18592000000000006, "calib/std_conf": 0.40051339126655927, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4214962078651686, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.1148526893466501, "calib/step_q_w": 0.3066435185185185, "calib/step_q_w_n": 864.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 537.7578125, "completions/mean_terminated_length": 539.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.0992, "grad_norm": 0.027518408372998238, "kl": 0.081146240234375, "learning_rate": 2.9722222222222225e-06, "loss": 0.0065, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03318294137716293, "mask/share_reasoning": 0.8384314775466919, "mask/share_step_conf": 0.12447934597730637, "num_tokens": 22417873.0, "reward": 0.9200072288513184, "reward_std": 0.20049725472927094, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.8553484678268433, "rewards/final_brier_reward_step": 0.6831035017967224, "rewards/format_reward_step": 0.97265625, "step": 93 }, { "adv/mean_abs_final_conf": 0.6092467904090881, "adv/mean_abs_reasoning": 0.5352387428283691, "adv/mean_abs_step_conf": 0.7489203214645386, "adv/ratio_final_to_reasoning": 1.1382710959779132, "adv/ratio_step_to_reasoning": 1.3992266656688734, "adv/std_final_conf": 0.83155757188797, "adv/std_reasoning": 0.7928059697151184, "adv/std_step_conf": 0.9342983961105347, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7352882966090515, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.1942570281124499, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6144578313253012, "calib/gap": 0.34813497822931794, "calib/mean_conf": 0.7179518072289156, "calib/mu_c": 0.8661538461538462, "calib/mu_w": 0.5180188679245282, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1689558232931728, "calib/std_conf": 0.37905852205075696, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4239228723404256, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.09478573828803727, "calib/step_q_w": 0.3291371340523883, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 487.31640625, "completions/mean_terminated_length": 491.1535339355469, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.10026666666666667, "grad_norm": 0.04939788579940796, "kl": 0.08742523193359375, "learning_rate": 2.944444444444445e-06, "loss": -0.0123, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03488625958561897, "mask/share_reasoning": 0.8387618064880371, "mask/share_step_conf": 0.11853942275047302, "num_tokens": 22651306.0, "reward": 0.9398282766342163, "reward_std": 0.20985865592956543, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.8465824127197266, "rewards/final_brier_reward_step": 0.7291679382324219, "rewards/format_reward_step": 0.9609375, "step": 94 }, { "adv/mean_abs_final_conf": 0.5665971636772156, "adv/mean_abs_reasoning": 0.4219573140144348, "adv/mean_abs_step_conf": 0.7465494871139526, "adv/ratio_final_to_reasoning": 1.342783131987215, "adv/ratio_step_to_reasoning": 1.7692535768876703, "adv/std_final_conf": 0.799198567867279, "adv/std_reasoning": 0.7206243276596069, "adv/std_step_conf": 0.9329851865768433, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7522349936143038, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.1738554216867471, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6506024096385542, "calib/gap": 0.3717943805874839, "calib/mean_conf": 0.7402811244979919, "calib/mu_c": 0.870185185185185, "calib/mu_w": 0.4983908045977011, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13176706827309248, "calib/std_conf": 0.3756979065059484, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.3911706315789474, "calib/step_q_c_n": 950.0, "calib/step_q_gap": 0.0785217636544191, "calib/step_q_w": 0.3126488679245283, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 511.7265625, "completions/mean_terminated_length": 511.7265625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.10133333333333333, "grad_norm": 0.02649582363665104, "kl": 0.07971954345703125, "learning_rate": 2.916666666666667e-06, "loss": -0.0095, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03568973019719124, "mask/share_reasoning": 0.8352484107017517, "mask/share_step_conf": 0.12906186282634735, "num_tokens": 22888436.0, "reward": 0.9772668480873108, "reward_std": 0.18089798092842102, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.867957592010498, "rewards/final_brier_reward_step": 0.7670449018478394, "rewards/format_reward_step": 0.96484375, "step": 95 }, { "adv/mean_abs_final_conf": 0.6147359609603882, "adv/mean_abs_reasoning": 0.4808635711669922, "adv/mean_abs_step_conf": 0.7362314462661743, "adv/ratio_final_to_reasoning": 1.2783999408990485, "adv/ratio_step_to_reasoning": 1.5310609711595289, "adv/std_final_conf": 0.8461417555809021, "adv/std_reasoning": 0.7393701672554016, "adv/std_step_conf": 0.9344803690910339, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7698014629049112, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.17388888888888893, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6746031746031746, "calib/gap": 0.4222988505747126, "calib/mean_conf": 0.7508730158730158, "calib/mu_c": 0.8966666666666666, "calib/mu_w": 0.474367816091954, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13500000000000006, "calib/std_conf": 0.37173556876008074, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.415, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.07909174311926603, "calib/step_q_w": 0.33590825688073395, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 462.59765625, "completions/mean_terminated_length": 462.59765625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.1024, "grad_norm": 0.026068033650517464, "kl": 0.093292236328125, "learning_rate": 2.888888888888889e-06, "loss": 0.0366, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03652406111359596, "mask/share_reasoning": 0.8425166606903076, "mask/share_step_conf": 0.12095930427312851, "num_tokens": 23112677.0, "reward": 0.9878829717636108, "reward_std": 0.2013242542743683, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.849997878074646, "rewards/final_brier_reward_step": 0.8007679581642151, "rewards/format_reward_step": 0.98046875, "step": 96 }, { "adv/mean_abs_final_conf": 0.7184832096099854, "adv/mean_abs_reasoning": 0.59392249584198, "adv/mean_abs_step_conf": 0.7223066091537476, "adv/ratio_final_to_reasoning": 1.2097255359748929, "adv/ratio_step_to_reasoning": 1.2161630755032482, "adv/std_final_conf": 0.9094932079315186, "adv/std_reasoning": 0.8265097737312317, "adv/std_step_conf": 0.934262752532959, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6376647834274953, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.28857312252964423, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5928853754940712, "calib/gap": 0.2069691148775895, "calib/mean_conf": 0.7016245059288537, "calib/mu_c": 0.7981555555555556, "calib/mu_w": 0.5911864406779661, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22830039525691698, "calib/std_conf": 0.3787427366765338, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40408163265306124, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.07610209301111748, "calib/step_q_w": 0.32797953964194376, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 460.6875, "completions/mean_terminated_length": 462.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.10346666666666667, "grad_norm": 0.0355035662651062, "kl": 0.0928802490234375, "learning_rate": 2.861111111111111e-06, "loss": -0.0871, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03594241291284561, "mask/share_reasoning": 0.8263546228408813, "mask/share_step_conf": 0.13379667699337006, "num_tokens": 23335685.0, "reward": 0.9316617250442505, "reward_std": 0.1955932229757309, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8849480152130127, "rewards/final_brier_reward_step": 0.6744691133499146, "rewards/format_reward_step": 0.98828125, "step": 97 }, { "adv/mean_abs_final_conf": 0.6633474826812744, "adv/mean_abs_reasoning": 0.5149575471878052, "adv/mean_abs_step_conf": 0.7673736214637756, "adv/ratio_final_to_reasoning": 1.2881595508286654, "adv/ratio_step_to_reasoning": 1.490168705467898, "adv/std_final_conf": 0.8635463714599609, "adv/std_reasoning": 0.7928749918937683, "adv/std_step_conf": 0.9342027306556702, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7269079083927212, "calib/avg_num_step_conf": 4.8359375, "calib/ece": 0.24089430894308933, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6382113821138211, "calib/gap": 0.33125164325745526, "calib/mean_conf": 0.718780487804878, "calib/mu_c": 0.8493959731543624, "calib/mu_w": 0.5181443298969072, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1769918699186991, "calib/std_conf": 0.3954385963986233, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.410972602739726, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.04305724840901737, "calib/step_q_w": 0.36791535433070865, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 510.96875, "completions/mean_terminated_length": 512.9725952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.10453333333333334, "grad_norm": 0.054965581744909286, "kl": 0.08089447021484375, "learning_rate": 2.8333333333333335e-06, "loss": -0.0625, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.035400211811065674, "mask/share_reasoning": 0.8491687774658203, "mask/share_step_conf": 0.11152474582195282, "num_tokens": 23572677.0, "reward": 0.932797372341156, "reward_std": 0.24011409282684326, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8460032343864441, "rewards/final_brier_reward_step": 0.7133413553237915, "rewards/format_reward_step": 0.94921875, "step": 98 }, { "adv/mean_abs_final_conf": 0.7435926198959351, "adv/mean_abs_reasoning": 0.6365935802459717, "adv/mean_abs_step_conf": 0.7237546443939209, "adv/ratio_final_to_reasoning": 1.1680806137074462, "adv/ratio_step_to_reasoning": 1.1369179125467639, "adv/std_final_conf": 0.9068244099617004, "adv/std_reasoning": 0.859096884727478, "adv/std_step_conf": 0.9345636367797852, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6776324614352783, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.2825910931174089, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4534412955465587, "calib/gap": 0.30777598926894706, "calib/mean_conf": 0.5718218623481782, "calib/mu_c": 0.7487619047619049, "calib/mu_w": 0.4409859154929578, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2146558704453441, "calib/std_conf": 0.41920147804862407, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3843027210884354, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.06500141104476725, "calib/step_q_w": 0.31930131004366813, "calib/step_q_w_n": 916.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 586.73828125, "completions/mean_terminated_length": 596.0516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.1056, "grad_norm": 0.060040753334760666, "kl": 0.07159423828125, "learning_rate": 2.805555555555556e-06, "loss": -0.0674, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031188489869236946, "mask/share_reasoning": 0.8459464311599731, "mask/share_step_conf": 0.10724010318517685, "num_tokens": 23828682.0, "reward": 0.8999078273773193, "reward_std": 0.24172960221767426, "rewards/accuracy_reward_step": 0.41015625, "rewards/asymmetric_l2_reward": 0.8456334471702576, "rewards/final_brier_reward_step": 0.6799633502960205, "rewards/format_reward_step": 0.9609375, "step": 99 }, { "adv/mean_abs_final_conf": 0.7108124494552612, "adv/mean_abs_reasoning": 0.5766913294792175, "adv/mean_abs_step_conf": 0.7223371863365173, "adv/ratio_final_to_reasoning": 1.232570030309216, "adv/ratio_step_to_reasoning": 1.2525542684833249, "adv/std_final_conf": 0.9088829755783081, "adv/std_reasoning": 0.8098970055580139, "adv/std_step_conf": 0.9337549805641174, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7749548037190084, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.19120481927710847, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5261044176706827, "calib/gap": 0.4201213842975208, "calib/mean_conf": 0.6380321285140562, "calib/mu_c": 0.8421875000000001, "calib/mu_w": 0.4220661157024793, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15759036144578317, "calib/std_conf": 0.4125205884747448, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.39832669322709163, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.06741885278692661, "calib/step_q_w": 0.330907840440165, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 584.765625, "completions/mean_terminated_length": 584.765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.10666666666666667, "grad_norm": 0.024518176913261414, "kl": 0.07038116455078125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0789, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03139622509479523, "mask/share_reasoning": 0.8543316721916199, "mask/share_step_conf": 0.1142721176147461, "num_tokens": 24085790.0, "reward": 0.9550304412841797, "reward_std": 0.22531947493553162, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.8710557818412781, "rewards/final_brier_reward_step": 0.7460362911224365, "rewards/format_reward_step": 0.96484375, "step": 100 }, { "adv/mean_abs_final_conf": 0.6849584579467773, "adv/mean_abs_reasoning": 0.5034070014953613, "adv/mean_abs_step_conf": 0.7498930096626282, "adv/ratio_final_to_reasoning": 1.3606454735673534, "adv/ratio_step_to_reasoning": 1.4896356376353221, "adv/std_final_conf": 0.8788121938705444, "adv/std_reasoning": 0.7753551006317139, "adv/std_step_conf": 0.9335606098175049, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6994607347489045, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.23389344262295084, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4098360655737705, "calib/gap": 0.30889046174587115, "calib/mean_conf": 0.5250409836065575, "calib/mu_c": 0.6883478260869564, "calib/mu_w": 0.3794573643410853, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1438114754098361, "calib/std_conf": 0.4189100787257916, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3757354925775978, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.07503650723374206, "calib/step_q_w": 0.30069898534385575, "calib/step_q_w_n": 887.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 583.92578125, "completions/mean_terminated_length": 586.2156982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.10773333333333333, "grad_norm": 0.034379322081804276, "kl": 0.0738525390625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0217, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030435508117079735, "mask/share_reasoning": 0.8451032638549805, "mask/share_step_conf": 0.12055499106645584, "num_tokens": 24342267.0, "reward": 0.9144086837768555, "reward_std": 0.21262916922569275, "rewards/accuracy_reward_step": 0.453125, "rewards/asymmetric_l2_reward": 0.855229377746582, "rewards/final_brier_reward_step": 0.6923378705978394, "rewards/format_reward_step": 0.953125, "step": 101 }, { "adv/mean_abs_final_conf": 0.5878695845603943, "adv/mean_abs_reasoning": 0.3899923264980316, "adv/mean_abs_step_conf": 0.7400535941123962, "adv/ratio_final_to_reasoning": 1.5073875679534976, "adv/ratio_step_to_reasoning": 1.8976106549525442, "adv/std_final_conf": 0.8100722432136536, "adv/std_reasoning": 0.681530773639679, "adv/std_step_conf": 0.9334313869476318, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7806785051683011, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.17003968253968255, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5595238095238095, "calib/gap": 0.40557513914656784, "calib/mean_conf": 0.6737698412698412, "calib/mu_c": 0.8314935064935066, "calib/mu_w": 0.4259183673469388, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11634920634920634, "calib/std_conf": 0.38713599724165704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40686440677966107, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.0938722807954091, "calib/step_q_w": 0.31299212598425197, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 480.0234375, "completions/mean_terminated_length": 480.0234375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1088, "grad_norm": 0.06203492358326912, "kl": 0.08522796630859375, "learning_rate": 2.7222222222222224e-06, "loss": 0.0534, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03778192400932312, "mask/share_reasoning": 0.8289196491241455, "mask/share_step_conf": 0.13329845666885376, "num_tokens": 24571849.0, "reward": 0.9879881143569946, "reward_std": 0.15715520083904266, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.8699907064437866, "rewards/final_brier_reward_step": 0.7887980341911316, "rewards/format_reward_step": 0.984375, "step": 102 }, { "adv/mean_abs_final_conf": 0.6246525049209595, "adv/mean_abs_reasoning": 0.3913930654525757, "adv/mean_abs_step_conf": 0.733914852142334, "adv/ratio_final_to_reasoning": 1.595972335888632, "adv/ratio_step_to_reasoning": 1.8751350417864288, "adv/std_final_conf": 0.8219296932220459, "adv/std_reasoning": 0.6816326975822449, "adv/std_step_conf": 0.9333863854408264, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7566779346457214, "calib/avg_num_step_conf": 5.890625, "calib/ece": 0.19665322580645156, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5806451612903226, "calib/gap": 0.37074898919599664, "calib/mean_conf": 0.7041532258064516, "calib/mu_c": 0.8641134751773051, "calib/mu_w": 0.49336448598130844, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16612903225806447, "calib/std_conf": 0.38403745500202563, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4069795918367347, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.09437933310452257, "calib/step_q_w": 0.31260025873221214, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 587.85546875, "completions/mean_terminated_length": 590.1608276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.10986666666666667, "grad_norm": 0.0305598396807909, "kl": 0.06987762451171875, "learning_rate": 2.6944444444444444e-06, "loss": 0.0375, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03288574516773224, "mask/share_reasoning": 0.8510973453521729, "mask/share_step_conf": 0.11211065948009491, "num_tokens": 24826892.0, "reward": 0.958093523979187, "reward_std": 0.1841270923614502, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.8656498193740845, "rewards/final_brier_reward_step": 0.7466309070587158, "rewards/format_reward_step": 0.96875, "step": 103 }, { "adv/mean_abs_final_conf": 0.6745511293411255, "adv/mean_abs_reasoning": 0.437224805355072, "adv/mean_abs_step_conf": 0.7339380383491516, "adv/ratio_final_to_reasoning": 1.54280160018213, "adv/ratio_step_to_reasoning": 1.678628543851984, "adv/std_final_conf": 0.8667935729026794, "adv/std_reasoning": 0.7206018567085266, "adv/std_step_conf": 0.9330757260322571, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7567415025711777, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.18438735177865614, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.4076044149002886, "calib/mean_conf": 0.5739525691699605, "calib/mu_c": 0.7656716417910449, "calib/mu_w": 0.35806722689075626, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11434782608695651, "calib/std_conf": 0.4134958809953477, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.407767253044655, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.12217361779373703, "calib/step_q_w": 0.28559363525091797, "calib/step_q_w_n": 817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 541.33203125, "completions/mean_terminated_length": 541.33203125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.11093333333333333, "grad_norm": 0.04134015738964081, "kl": 0.07474517822265625, "learning_rate": 2.666666666666667e-06, "loss": -0.0286, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032088808715343475, "mask/share_reasoning": 0.8443004488945007, "mask/share_step_conf": 0.12361074984073639, "num_tokens": 25072153.0, "reward": 0.9815112948417664, "reward_std": 0.16471192240715027, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.8935236930847168, "rewards/final_brier_reward_step": 0.7687175869941711, "rewards/format_reward_step": 0.98046875, "step": 104 }, { "adv/mean_abs_final_conf": 0.7218550443649292, "adv/mean_abs_reasoning": 0.6229248642921448, "adv/mean_abs_step_conf": 0.7113521099090576, "adv/ratio_final_to_reasoning": 1.1588155903604889, "adv/ratio_step_to_reasoning": 1.1419549141249905, "adv/std_final_conf": 0.9065027236938477, "adv/std_reasoning": 0.8430431485176086, "adv/std_step_conf": 0.9336177110671997, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7497893577030267, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.19215999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.496, "calib/gap": 0.3931103765636139, "calib/mean_conf": 0.6092000000000001, "calib/mu_c": 0.7837410071942446, "calib/mu_w": 0.39063063063063064, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12267999999999994, "calib/std_conf": 0.41203417334002773, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3973221216041397, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.08317197996108017, "calib/step_q_w": 0.31415014164305954, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 542.796875, "completions/mean_terminated_length": 542.796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.112, "grad_norm": 0.05769439414143562, "kl": 0.07154083251953125, "learning_rate": 2.6388888888888893e-06, "loss": 0.0324, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03391508758068085, "mask/share_reasoning": 0.8441091179847717, "mask/share_step_conf": 0.12197580933570862, "num_tokens": 25316869.0, "reward": 0.9692540168762207, "reward_std": 0.21710465848445892, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8789149522781372, "rewards/final_brier_reward_step": 0.7564679384231567, "rewards/format_reward_step": 0.97265625, "step": 105 }, { "adv/mean_abs_final_conf": 0.6788108348846436, "adv/mean_abs_reasoning": 0.4150359034538269, "adv/mean_abs_step_conf": 0.7503209114074707, "adv/ratio_final_to_reasoning": 1.6355472604556531, "adv/ratio_step_to_reasoning": 1.807845791565222, "adv/std_final_conf": 0.8545103669166565, "adv/std_reasoning": 0.7012789249420166, "adv/std_step_conf": 0.9325715899467468, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7854966677245319, "calib/avg_num_step_conf": 5.3671875, "calib/ece": 0.18345238095238095, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5436507936507936, "calib/gap": 0.39273310060298333, "calib/mean_conf": 0.6619444444444444, "calib/mu_c": 0.8411678832116789, "calib/mu_w": 0.44843478260869557, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15087301587301588, "calib/std_conf": 0.39241896960675976, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40268258426966297, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.08248620964730641, "calib/step_q_w": 0.32019637462235656, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 481.9765625, "completions/mean_terminated_length": 481.9765625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11306666666666666, "grad_norm": 0.04111357033252716, "kl": 0.0830841064453125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0094, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035246968269348145, "mask/share_reasoning": 0.8441534042358398, "mask/share_step_conf": 0.1205996572971344, "num_tokens": 25544839.0, "reward": 0.9766442775726318, "reward_std": 0.1666399985551834, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.8827617168426514, "rewards/final_brier_reward_step": 0.7666206955909729, "rewards/format_reward_step": 0.984375, "step": 106 }, { "adv/mean_abs_final_conf": 0.6700491905212402, "adv/mean_abs_reasoning": 0.46548759937286377, "adv/mean_abs_step_conf": 0.7599701881408691, "adv/ratio_final_to_reasoning": 1.4394565857908472, "adv/ratio_step_to_reasoning": 1.6326325108654927, "adv/std_final_conf": 0.8719359636306763, "adv/std_reasoning": 0.7206145524978638, "adv/std_step_conf": 0.9324932098388672, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6578612753512619, "calib/avg_num_step_conf": 6.25390625, "calib/ece": 0.2519291338582677, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6653543307086615, "calib/gap": 0.2677951554453557, "calib/mean_conf": 0.7520866141732283, "calib/mu_c": 0.8648979591836735, "calib/mu_w": 0.5971028037383178, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21263779527559057, "calib/std_conf": 0.3685606932704154, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40989394285714287, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.08921901172766628, "calib/step_q_w": 0.3206749311294766, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2140.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 491.32421875, "completions/mean_terminated_length": 491.32421875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.11413333333333334, "grad_norm": 0.03381947800517082, "kl": 0.08031463623046875, "learning_rate": 2.5833333333333337e-06, "loss": 0.0376, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03411445766687393, "mask/share_reasoning": 0.8311688899993896, "mask/share_step_conf": 0.13471662998199463, "num_tokens": 25775234.0, "reward": 0.9557232856750488, "reward_std": 0.1572231650352478, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.8837653398513794, "rewards/final_brier_reward_step": 0.7151812314987183, "rewards/format_reward_step": 0.98828125, "step": 107 }, { "adv/mean_abs_final_conf": 0.5977965593338013, "adv/mean_abs_reasoning": 0.40435507893562317, "adv/mean_abs_step_conf": 0.7389947772026062, "adv/ratio_final_to_reasoning": 1.4783950801542316, "adv/ratio_step_to_reasoning": 1.8275887102688293, "adv/std_final_conf": 0.8289056420326233, "adv/std_reasoning": 0.6816290020942688, "adv/std_step_conf": 0.9333050847053528, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7069702328323018, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.19730158730158717, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7063492063492064, "calib/gap": 0.33805481874447396, "calib/mean_conf": 0.7677777777777778, "calib/mu_c": 0.8724137931034484, "calib/mu_w": 0.5343589743589744, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13730158730158717, "calib/std_conf": 0.37361466629570667, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42418063314711363, "calib/step_q_c_n": 1074.0, "calib/step_q_gap": 0.08563896648044694, "calib/step_q_w": 0.3385416666666667, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 530.125, "completions/mean_terminated_length": 532.2039794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.1152, "grad_norm": 0.053452033549547195, "kl": 0.07540130615234375, "learning_rate": 2.5555555555555557e-06, "loss": -0.0121, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03260500729084015, "mask/share_reasoning": 0.8317328691482544, "mask/share_step_conf": 0.13175587356090546, "num_tokens": 26014178.0, "reward": 0.9928351044654846, "reward_std": 0.16699038445949554, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.8799116611480713, "rewards/final_brier_reward_step": 0.7729461193084717, "rewards/format_reward_step": 0.984375, "step": 108 }, { "adv/mean_abs_final_conf": 0.6117569804191589, "adv/mean_abs_reasoning": 0.3977644145488739, "adv/mean_abs_step_conf": 0.7265390157699585, "adv/ratio_final_to_reasoning": 1.5379882112204168, "adv/ratio_step_to_reasoning": 1.826556095004038, "adv/std_final_conf": 0.8428977131843567, "adv/std_reasoning": 0.701329231262207, "adv/std_step_conf": 0.9332946538925171, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8309944119888238, "calib/avg_num_step_conf": 6.8203125, "calib/ece": 0.15553784860557762, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.549800796812749, "calib/gap": 0.5199441198882397, "calib/mean_conf": 0.6278884462151394, "calib/mu_c": 0.8909677419354839, "calib/mu_w": 0.3710236220472441, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14470119521912345, "calib/std_conf": 0.4216018631150103, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4085305105853051, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.10964397824172079, "calib/step_q_w": 0.2988865323435843, "calib/step_q_w_n": 943.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2384.0, "completions/max_terminated_length": 2384.0, "completions/mean_length": 556.96484375, "completions/mean_terminated_length": 556.96484375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.11626666666666667, "grad_norm": 0.02547648921608925, "kl": 0.07645416259765625, "learning_rate": 2.5277777777777778e-06, "loss": -0.0174, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031178954988718033, "mask/share_reasoning": 0.8346405029296875, "mask/share_step_conf": 0.13418057560920715, "num_tokens": 26261361.0, "reward": 0.9844216108322144, "reward_std": 0.1782597303390503, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.8859130144119263, "rewards/final_brier_reward_step": 0.791523814201355, "rewards/format_reward_step": 0.97265625, "step": 109 }, { "adv/mean_abs_final_conf": 0.7054433226585388, "adv/mean_abs_reasoning": 0.4843199551105499, "adv/mean_abs_step_conf": 0.7825179100036621, "adv/ratio_final_to_reasoning": 1.4565646432997288, "adv/ratio_step_to_reasoning": 1.615704456829672, "adv/std_final_conf": 0.8620911240577698, "adv/std_reasoning": 0.7207316160202026, "adv/std_step_conf": 0.9318588376045227, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6559316569954868, "calib/avg_num_step_conf": 5.3671875, "calib/ece": 0.27661354581673303, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5896414342629482, "calib/gap": 0.260464861379755, "calib/mean_conf": 0.6825896414342629, "calib/mu_c": 0.7967375886524822, "calib/mu_w": 0.5362727272727272, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1987250996015936, "calib/std_conf": 0.4021932734586809, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44165925925925925, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.09314709903036078, "calib/step_q_w": 0.34851216022889847, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 491.0390625, "completions/mean_terminated_length": 491.0390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.11733333333333333, "grad_norm": 0.04176861792802811, "kl": 0.1174468994140625, "learning_rate": 2.5e-06, "loss": 0.0367, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03552088141441345, "mask/share_reasoning": 0.8433677554130554, "mask/share_step_conf": 0.12111136317253113, "num_tokens": 26491987.0, "reward": 0.929660439491272, "reward_std": 0.18326841294765472, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.8658431172370911, "rewards/final_brier_reward_step": 0.6880090236663818, "rewards/format_reward_step": 0.9765625, "step": 110 }, { "adv/mean_abs_final_conf": 0.614783763885498, "adv/mean_abs_reasoning": 0.5257919430732727, "adv/mean_abs_step_conf": 0.7170236110687256, "adv/ratio_final_to_reasoning": 1.169252918354102, "adv/ratio_step_to_reasoning": 1.3637021649242036, "adv/std_final_conf": 0.8417708277702332, "adv/std_reasoning": 0.7755916118621826, "adv/std_step_conf": 0.933610200881958, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6853343013662464, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.2681854838709678, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6370967741935484, "calib/gap": 0.30863111345785776, "calib/mean_conf": 0.7039919354838711, "calib/mu_c": 0.8396402877697843, "calib/mu_w": 0.5310091743119265, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20584677419354847, "calib/std_conf": 0.40234468043015986, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4302653631284916, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.07023695403758251, "calib/step_q_w": 0.3600284090909091, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 529.79296875, "completions/mean_terminated_length": 531.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 0.1184, "grad_norm": 0.03763270750641823, "kl": 0.0719451904296875, "learning_rate": 2.4722222222222226e-06, "loss": -0.0694, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03530872240662575, "mask/share_reasoning": 0.8400354385375977, "mask/share_step_conf": 0.12074960768222809, "num_tokens": 26735022.0, "reward": 0.9260072708129883, "reward_std": 0.21023571491241455, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8536633253097534, "rewards/final_brier_reward_step": 0.6967886686325073, "rewards/format_reward_step": 0.96484375, "step": 111 }, { "adv/mean_abs_final_conf": 0.6220129728317261, "adv/mean_abs_reasoning": 0.4993098974227905, "adv/mean_abs_step_conf": 0.7455708384513855, "adv/ratio_final_to_reasoning": 1.2457453297887198, "adv/ratio_step_to_reasoning": 1.4932026028317913, "adv/std_final_conf": 0.8272049427032471, "adv/std_reasoning": 0.7575883865356445, "adv/std_step_conf": 0.9338021874427795, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8360327743902439, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.16860557768924303, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.5284616361788618, "calib/mean_conf": 0.5791235059760956, "calib/mu_c": 0.8486178861788618, "calib/mu_w": 0.32015625000000003, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1288446215139442, "calib/std_conf": 0.4347972333728267, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45525581395348835, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.12670453190220632, "calib/step_q_w": 0.32855128205128203, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 565.5390625, "completions/mean_terminated_length": 567.7568969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.11946666666666667, "grad_norm": 0.0486106239259243, "kl": 0.07489013671875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0159, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030696198344230652, "mask/share_reasoning": 0.8548277020454407, "mask/share_step_conf": 0.11056986451148987, "num_tokens": 26987720.0, "reward": 0.973731517791748, "reward_std": 0.18799945712089539, "rewards/accuracy_reward_step": 0.48046875, "rewards/asymmetric_l2_reward": 0.8586279153823853, "rewards/final_brier_reward_step": 0.7974288463592529, "rewards/format_reward_step": 0.9765625, "step": 112 }, { "adv/mean_abs_final_conf": 0.6730247139930725, "adv/mean_abs_reasoning": 0.5346618890762329, "adv/mean_abs_step_conf": 0.730544924736023, "adv/ratio_final_to_reasoning": 1.258785650789319, "adv/ratio_step_to_reasoning": 1.3663680536464433, "adv/std_final_conf": 0.8757805824279785, "adv/std_reasoning": 0.8097303509712219, "adv/std_step_conf": 0.933262050151825, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7831081081081082, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.1924557768924303, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.4057848133848132, "calib/mean_conf": 0.7248350597609562, "calib/mu_c": 0.9042857142857142, "calib/mu_w": 0.49850090090090104, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1797609561752988, "calib/std_conf": 0.3822900440956289, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40888641425389755, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.06416038685663722, "calib/step_q_w": 0.3447260273972603, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 490.23828125, "completions/mean_terminated_length": 490.23828125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.12053333333333334, "grad_norm": 0.041349541395902634, "kl": 0.0941314697265625, "learning_rate": 2.4166666666666667e-06, "loss": 0.0675, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035144634544849396, "mask/share_reasoning": 0.8297086954116821, "mask/share_step_conf": 0.1351466178894043, "num_tokens": 27218421.0, "reward": 0.982533872127533, "reward_std": 0.20815324783325195, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.894577145576477, "rewards/final_brier_reward_step": 0.7642405033111572, "rewards/format_reward_step": 0.98046875, "step": 113 }, { "adv/mean_abs_final_conf": 0.5990077257156372, "adv/mean_abs_reasoning": 0.44762060046195984, "adv/mean_abs_step_conf": 0.7410391569137573, "adv/ratio_final_to_reasoning": 1.3382041065523809, "adv/ratio_step_to_reasoning": 1.6555072669778368, "adv/std_final_conf": 0.8304715156555176, "adv/std_reasoning": 0.7391616702079773, "adv/std_step_conf": 0.9329032897949219, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8231817271087959, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.19153543307086618, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7401574803149606, "calib/gap": 0.44428170707273107, "calib/mean_conf": 0.7983858267716535, "calib/mu_c": 0.9610559006211181, "calib/mu_w": 0.516774193548387, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17803149606299218, "calib/std_conf": 0.3566744815421641, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44985092491838957, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.11490142996889463, "calib/step_q_w": 0.33494949494949494, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 496.80859375, "completions/mean_terminated_length": 498.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1216, "grad_norm": 0.05756969749927521, "kl": 0.081695556640625, "learning_rate": 2.388888888888889e-06, "loss": 0.0596, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0352327898144722, "mask/share_reasoning": 0.8273290395736694, "mask/share_step_conf": 0.13353195786476135, "num_tokens": 27450628.0, "reward": 1.0205554962158203, "reward_std": 0.18426382541656494, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.903445839881897, "rewards/final_brier_reward_step": 0.8134465217590332, "rewards/format_reward_step": 0.9921875, "step": 114 }, { "adv/mean_abs_final_conf": 0.6113198399543762, "adv/mean_abs_reasoning": 0.49032288789749146, "adv/mean_abs_step_conf": 0.7666841745376587, "adv/ratio_final_to_reasoning": 1.2467699449554979, "adv/ratio_step_to_reasoning": 1.5636312182472385, "adv/std_final_conf": 0.8144198060035706, "adv/std_reasoning": 0.739387035369873, "adv/std_step_conf": 0.9330512881278992, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6646076046600947, "calib/avg_num_step_conf": 5.3671875, "calib/ece": 0.3328063241106719, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7747035573122529, "calib/gap": 0.16428946357700702, "calib/mean_conf": 0.8259288537549406, "calib/mu_c": 0.8954109589041097, "calib/mu_w": 0.7311214953271027, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29083003952569164, "calib/std_conf": 0.3312970480832067, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.45648578811369506, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.07716912144702831, "calib/step_q_w": 0.37931666666666675, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 466.1875, "completions/mean_terminated_length": 466.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.12266666666666666, "grad_norm": 0.0432070791721344, "kl": 0.07665252685546875, "learning_rate": 2.361111111111111e-06, "loss": 0.0476, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03583553060889244, "mask/share_reasoning": 0.835993766784668, "mask/share_step_conf": 0.12817072868347168, "num_tokens": 27675236.0, "reward": 0.9144834280014038, "reward_std": 0.20204773545265198, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.8649461269378662, "rewards/final_brier_reward_step": 0.6530832052230835, "rewards/format_reward_step": 0.984375, "step": 115 }, { "adv/mean_abs_final_conf": 0.6029642820358276, "adv/mean_abs_reasoning": 0.39965057373046875, "adv/mean_abs_step_conf": 0.7622563242912292, "adv/ratio_final_to_reasoning": 1.508728678674379, "adv/ratio_step_to_reasoning": 1.9073069686253175, "adv/std_final_conf": 0.8000175952911377, "adv/std_reasoning": 0.661288321018219, "adv/std_step_conf": 0.9327738285064697, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7190315315315317, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.3134901960784313, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.803921568627451, "calib/gap": 0.25926801801801813, "calib/mean_conf": 0.8483921568627453, "calib/mu_c": 0.9612500000000002, "calib/mu_w": 0.701981981981982, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2985882352941176, "calib/std_conf": 0.32429374716187354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4388196176226101, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.10645527653733883, "calib/step_q_w": 0.3323643410852713, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 524.25, "completions/mean_terminated_length": 524.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.12373333333333333, "grad_norm": 0.028108853846788406, "kl": 0.0692291259765625, "learning_rate": 2.3333333333333336e-06, "loss": 0.0478, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034591950476169586, "mask/share_reasoning": 0.838742733001709, "mask/share_step_conf": 0.1266653686761856, "num_tokens": 27913964.0, "reward": 0.9537807106971741, "reward_std": 0.19260621070861816, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.9025558829307556, "rewards/final_brier_reward_step": 0.6932867169380188, "rewards/format_reward_step": 0.99609375, "step": 116 }, { "adv/mean_abs_final_conf": 0.6685183048248291, "adv/mean_abs_reasoning": 0.4874965250492096, "adv/mean_abs_step_conf": 0.740862250328064, "adv/ratio_final_to_reasoning": 1.37132937461933, "adv/ratio_step_to_reasoning": 1.519728269351825, "adv/std_final_conf": 0.8593764305114746, "adv/std_reasoning": 0.7574522495269775, "adv/std_step_conf": 0.9332804679870605, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6033924680983505, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.44433070866141733, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8622047244094488, "calib/gap": 0.1423678804855275, "calib/mean_conf": 0.8918110236220473, "calib/mu_c": 0.9674789915966386, "calib/mu_w": 0.8251111111111111, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4338188976377953, "calib/std_conf": 0.27420153896899985, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.44812593703148423, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.06114805005359725, "calib/step_q_w": 0.386977886977887, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 502.703125, "completions/mean_terminated_length": 502.703125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1248, "grad_norm": 0.04808332771062851, "kl": 0.07646942138671875, "learning_rate": 2.305555555555556e-06, "loss": -0.0376, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03324298560619354, "mask/share_reasoning": 0.8397763967514038, "mask/share_step_conf": 0.12698057293891907, "num_tokens": 28149256.0, "reward": 0.8545684814453125, "reward_std": 0.2120169848203659, "rewards/accuracy_reward_step": 0.46484375, "rewards/asymmetric_l2_reward": 0.8599258661270142, "rewards/final_brier_reward_step": 0.5593671798706055, "rewards/format_reward_step": 0.984375, "step": 117 }, { "adv/mean_abs_final_conf": 0.5423208475112915, "adv/mean_abs_reasoning": 0.4119876027107239, "adv/mean_abs_step_conf": 0.757232129573822, "adv/ratio_final_to_reasoning": 1.3163523463886384, "adv/ratio_step_to_reasoning": 1.8379973683468112, "adv/std_final_conf": 0.7762859463691711, "adv/std_reasoning": 0.7013146877288818, "adv/std_step_conf": 0.9321770668029785, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6684493754982728, "calib/avg_num_step_conf": 7.18359375, "calib/ece": 0.3157258064516129, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8266129032258065, "calib/gap": 0.24094339622641503, "calib/mean_conf": 0.8620161290322582, "calib/mu_c": 0.965, "calib/mu_w": 0.7240566037735849, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.30258064516129035, "calib/std_conf": 0.3080400577184876, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4405574516496018, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.10321370164960181, "calib/step_q_w": 0.33734375, "calib/step_q_w_n": 960.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 588.1640625, "completions/mean_terminated_length": 590.4706420898438, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.12586666666666665, "grad_norm": 0.045879751443862915, "kl": 0.06640625, "learning_rate": 2.277777777777778e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03145633265376091, "mask/share_reasoning": 0.8349190950393677, "mask/share_step_conf": 0.1297183483839035, "num_tokens": 28403834.0, "reward": 0.9131325483322144, "reward_std": 0.18790775537490845, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8495236039161682, "rewards/final_brier_reward_step": 0.672835111618042, "rewards/format_reward_step": 0.96484375, "step": 118 }, { "adv/mean_abs_final_conf": 0.579788863658905, "adv/mean_abs_reasoning": 0.5056012868881226, "adv/mean_abs_step_conf": 0.7397458553314209, "adv/ratio_final_to_reasoning": 1.146731384382727, "adv/ratio_step_to_reasoning": 1.4631012114000195, "adv/std_final_conf": 0.797834575176239, "adv/std_reasoning": 0.7394025921821594, "adv/std_step_conf": 0.9339104890823364, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7308721765243504, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.2750200803212853, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7309236947791165, "calib/gap": 0.310268311790051, "calib/mean_conf": 0.7846586345381525, "calib/mu_c": 0.9229710144927536, "calib/mu_w": 0.6127027027027027, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25273092369477923, "calib/std_conf": 0.3692910606631113, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4636363636363636, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.15116311750081074, "calib/step_q_w": 0.3124732461355529, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 560.40625, "completions/mean_terminated_length": 562.6039428710938, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.12693333333333334, "grad_norm": 0.0336654931306839, "kl": 0.0772705078125, "learning_rate": 2.25e-06, "loss": -0.0541, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03116082400083542, "mask/share_reasoning": 0.8479986190795898, "mask/share_step_conf": 0.11693429946899414, "num_tokens": 28652362.0, "reward": 0.9408432841300964, "reward_std": 0.20290254056453705, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.8821717500686646, "rewards/final_brier_reward_step": 0.6971710920333862, "rewards/format_reward_step": 0.97265625, "step": 119 }, { "adv/mean_abs_final_conf": 0.6259399652481079, "adv/mean_abs_reasoning": 0.38975387811660767, "adv/mean_abs_step_conf": 0.7443583011627197, "adv/ratio_final_to_reasoning": 1.6059877794489512, "adv/ratio_step_to_reasoning": 1.9098162788261481, "adv/std_final_conf": 0.81560218334198, "adv/std_reasoning": 0.6612488627433777, "adv/std_step_conf": 0.933025598526001, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7535215776667947, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.23131474103585664, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6932270916334662, "calib/gap": 0.4136899731079523, "calib/mean_conf": 0.7381673306772908, "calib/mu_c": 0.9260583941605839, "calib/mu_w": 0.5123684210526316, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21183266932270922, "calib/std_conf": 0.40169539594567233, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4514522821576764, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.10832439309758396, "calib/step_q_w": 0.34312788906009245, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2444.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 492.75390625, "completions/mean_terminated_length": 494.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.128, "grad_norm": 0.03507012873888016, "kl": 0.07172393798828125, "learning_rate": 2.222222222222222e-06, "loss": -0.0091, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03331432864069939, "mask/share_reasoning": 0.845699667930603, "mask/share_step_conf": 0.1170797273516655, "num_tokens": 28885195.0, "reward": 0.9636802673339844, "reward_std": 0.18801307678222656, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.8802074193954468, "rewards/final_brier_reward_step": 0.7440280914306641, "rewards/format_reward_step": 0.98046875, "step": 120 }, { "adv/mean_abs_final_conf": 0.693418025970459, "adv/mean_abs_reasoning": 0.5543345212936401, "adv/mean_abs_step_conf": 0.734613835811615, "adv/ratio_final_to_reasoning": 1.250901755770581, "adv/ratio_step_to_reasoning": 1.325217549318885, "adv/std_final_conf": 0.8799854516983032, "adv/std_reasoning": 0.7755146622657776, "adv/std_step_conf": 0.9336560368537903, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6444260520786912, "calib/avg_num_step_conf": 6.58984375, "calib/ece": 0.33027777777777784, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7738095238095238, "calib/gap": 0.20913541732985297, "calib/mean_conf": 0.8230555555555555, "calib/mu_c": 0.9168345323741007, "calib/mu_w": 0.7076991150442478, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30087301587301596, "calib/std_conf": 0.33569725603172723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.410047281323877, "calib/step_q_c_n": 846.0, "calib/step_q_gap": 0.07631363090770577, "calib/step_q_w": 0.33373365041617126, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 566.36328125, "completions/mean_terminated_length": 568.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.12906666666666666, "grad_norm": 0.05059061199426651, "kl": 0.06748199462890625, "learning_rate": 2.1944444444444445e-06, "loss": 0.1015, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03129071742296219, "mask/share_reasoning": 0.8439393043518066, "mask/share_step_conf": 0.12086370587348938, "num_tokens": 29135240.0, "reward": 0.92536461353302, "reward_std": 0.23206710815429688, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8859968185424805, "rewards/final_brier_reward_step": 0.6592636704444885, "rewards/format_reward_step": 0.984375, "step": 121 }, { "adv/mean_abs_final_conf": 0.637088418006897, "adv/mean_abs_reasoning": 0.4608699381351471, "adv/mean_abs_step_conf": 0.7540398836135864, "adv/ratio_final_to_reasoning": 1.3823605431606063, "adv/ratio_step_to_reasoning": 1.63612295187817, "adv/std_final_conf": 0.842920184135437, "adv/std_reasoning": 0.7205365896224976, "adv/std_step_conf": 0.9331433176994324, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7871930050147871, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.18192156862745107, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6745098039215687, "calib/gap": 0.4593088594573743, "calib/mean_conf": 0.7199607843137256, "calib/mu_c": 0.9018831168831168, "calib/mu_w": 0.4425742574257425, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14898039215686285, "calib/std_conf": 0.40719171958838457, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4631063321385902, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.13792948326399213, "calib/step_q_w": 0.3251768488745981, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 493.8046875, "completions/mean_terminated_length": 493.8046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.13013333333333332, "grad_norm": 0.060518212616443634, "kl": 0.072021484375, "learning_rate": 2.166666666666667e-06, "loss": -0.0451, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034646786749362946, "mask/share_reasoning": 0.8425935506820679, "mask/share_step_conf": 0.12275967001914978, "num_tokens": 29368998.0, "reward": 1.011244773864746, "reward_std": 0.18983519077301025, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.9048250913619995, "rewards/final_brier_reward_step": 0.7981331944465637, "rewards/format_reward_step": 0.99609375, "step": 122 }, { "adv/mean_abs_final_conf": 0.7194132804870605, "adv/mean_abs_reasoning": 0.5484261512756348, "adv/mean_abs_step_conf": 0.7414064407348633, "adv/ratio_final_to_reasoning": 1.3117778552567398, "adv/ratio_step_to_reasoning": 1.3518801738581538, "adv/std_final_conf": 0.8892462849617004, "adv/std_reasoning": 0.792913556098938, "adv/std_step_conf": 0.9334316253662109, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7523837706319458, "calib/avg_num_step_conf": 6.34765625, "calib/ece": 0.2340725806451614, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5645161290322581, "calib/gap": 0.3882784244098114, "calib/mean_conf": 0.6297177419354838, "calib/mu_c": 0.8035036496350366, "calib/mu_w": 0.4152252252252252, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15568548387096784, "calib/std_conf": 0.431571959022196, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3978266331658291, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.07244665729128147, "calib/step_q_w": 0.32537997587454764, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2897.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 589.12109375, "completions/mean_terminated_length": 591.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.1312, "grad_norm": 0.038368623703718185, "kl": 0.0644683837890625, "learning_rate": 2.138888888888889e-06, "loss": 0.0179, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02929634228348732, "mask/share_reasoning": 0.8502581119537354, "mask/share_step_conf": 0.1165393590927124, "num_tokens": 29625101.0, "reward": 0.9518745541572571, "reward_std": 0.21453779935836792, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.8731791973114014, "rewards/final_brier_reward_step": 0.7290074229240417, "rewards/format_reward_step": 0.96875, "step": 123 }, { "adv/mean_abs_final_conf": 0.6313794851303101, "adv/mean_abs_reasoning": 0.46017879247665405, "adv/mean_abs_step_conf": 0.7264816761016846, "adv/ratio_final_to_reasoning": 1.3720308181354128, "adv/ratio_step_to_reasoning": 1.578694385701272, "adv/std_final_conf": 0.85479336977005, "adv/std_reasoning": 0.7573562860488892, "adv/std_step_conf": 0.9332188367843628, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7461487820934826, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.2508300395256916, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5810276679841897, "calib/gap": 0.3679802501645819, "calib/mean_conf": 0.6370750988142293, "calib/mu_c": 0.7796129032258065, "calib/mu_w": 0.41163265306122454, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13762845849802363, "calib/std_conf": 0.43541918185282724, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4402631578947368, "calib/step_q_c_n": 874.0, "calib/step_q_gap": 0.0714923937751355, "calib/step_q_w": 0.3687707641196013, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 528.83203125, "completions/mean_terminated_length": 528.83203125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.13226666666666667, "grad_norm": 0.10215196013450623, "kl": 0.08013153076171875, "learning_rate": 2.1111111111111114e-06, "loss": -0.0337, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03226089105010033, "mask/share_reasoning": 0.8445743322372437, "mask/share_step_conf": 0.12316481024026871, "num_tokens": 29867298.0, "reward": 0.9671303033828735, "reward_std": 0.17288029193878174, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8771121501922607, "rewards/final_brier_reward_step": 0.7383984327316284, "rewards/format_reward_step": 0.98828125, "step": 124 }, { "adv/mean_abs_final_conf": 0.7052055597305298, "adv/mean_abs_reasoning": 0.4439585208892822, "adv/mean_abs_step_conf": 0.7640683650970459, "adv/ratio_final_to_reasoning": 1.5884492053851116, "adv/ratio_step_to_reasoning": 1.7210354777436407, "adv/std_final_conf": 0.8930036425590515, "adv/std_reasoning": 0.739206075668335, "adv/std_step_conf": 0.932820737361908, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6647173489278753, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.31192771084337334, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5943775100401606, "calib/gap": 0.25868226120857696, "calib/mean_conf": 0.635863453815261, "calib/mu_c": 0.7542962962962962, "calib/mu_w": 0.4956140350877193, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20281124497991954, "calib/std_conf": 0.4397463642706736, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44329396325459314, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.10667917726237525, "calib/step_q_w": 0.3366147859922179, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 530.0, "completions/mean_terminated_length": 534.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.13333333333333333, "grad_norm": 0.03873305022716522, "kl": 0.06960296630859375, "learning_rate": 2.0833333333333334e-06, "loss": -0.0536, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03307991474866867, "mask/share_reasoning": 0.8385022878646851, "mask/share_step_conf": 0.12060528248548508, "num_tokens": 30107786.0, "reward": 0.8997880220413208, "reward_std": 0.20577451586723328, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8392912149429321, "rewards/final_brier_reward_step": 0.6595035195350647, "rewards/format_reward_step": 0.97265625, "step": 125 }, { "adv/mean_abs_final_conf": 0.6224679350852966, "adv/mean_abs_reasoning": 0.45161741971969604, "adv/mean_abs_step_conf": 0.7560645341873169, "adv/ratio_final_to_reasoning": 1.378308072066046, "adv/ratio_step_to_reasoning": 1.67412615451499, "adv/std_final_conf": 0.8212738633155823, "adv/std_reasoning": 0.7205803990364075, "adv/std_step_conf": 0.931928813457489, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.797762148337596, "calib/avg_num_step_conf": 6.77734375, "calib/ece": 0.20988047808764945, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5617529880478087, "calib/gap": 0.43679859335038357, "calib/mean_conf": 0.6259760956175299, "calib/mu_c": 0.8261029411764705, "calib/mu_w": 0.38930434782608697, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1470119521912351, "calib/std_conf": 0.43423622491700775, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42935049019607846, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.1333330799675692, "calib/step_q_w": 0.29601741022850925, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 555.703125, "completions/mean_terminated_length": 557.8823852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1344, "grad_norm": 0.051755405962467194, "kl": 0.06482315063476562, "learning_rate": 2.0555555555555555e-06, "loss": -0.027, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03272949904203415, "mask/share_reasoning": 0.8290582299232483, "mask/share_step_conf": 0.13430599868297577, "num_tokens": 30355510.0, "reward": 0.9703141450881958, "reward_std": 0.1568230241537094, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8804025650024414, "rewards/final_brier_reward_step": 0.7578819990158081, "rewards/format_reward_step": 0.98046875, "step": 126 }, { "adv/mean_abs_final_conf": 0.650374174118042, "adv/mean_abs_reasoning": 0.46083492040634155, "adv/mean_abs_step_conf": 0.7511869668960571, "adv/ratio_final_to_reasoning": 1.4112953366133236, "adv/ratio_step_to_reasoning": 1.630056520529515, "adv/std_final_conf": 0.8628984093666077, "adv/std_reasoning": 0.739283561706543, "adv/std_step_conf": 0.9332513809204102, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8018252933507171, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.21907258064516133, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5362903225806451, "calib/gap": 0.46292959582790094, "calib/mean_conf": 0.5784274193548387, "calib/mu_c": 0.7986923076923077, "calib/mu_w": 0.33576271186440676, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13665322580645164, "calib/std_conf": 0.4548098091224347, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4438493150684931, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.14875194966184935, "calib/step_q_w": 0.2950973654066438, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 549.046875, "completions/mean_terminated_length": 549.046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.13546666666666668, "grad_norm": 0.035358961671590805, "kl": 0.07138442993164062, "learning_rate": 2.027777777777778e-06, "loss": 0.1926, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03422444313764572, "mask/share_reasoning": 0.831305980682373, "mask/share_step_conf": 0.13446959853172302, "num_tokens": 30599738.0, "reward": 0.9445489645004272, "reward_std": 0.20476898550987244, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.860126793384552, "rewards/final_brier_reward_step": 0.7360023260116577, "rewards/format_reward_step": 0.95703125, "step": 127 }, { "adv/mean_abs_final_conf": 0.704669177532196, "adv/mean_abs_reasoning": 0.5308483839035034, "adv/mean_abs_step_conf": 0.761113166809082, "adv/ratio_final_to_reasoning": 1.3274396209903305, "adv/ratio_step_to_reasoning": 1.43376751232125, "adv/std_final_conf": 0.85466468334198, "adv/std_reasoning": 0.7577895522117615, "adv/std_step_conf": 0.9339450001716614, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7072817820849318, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.27745901639344256, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5778688524590164, "calib/gap": 0.3277353792314422, "calib/mean_conf": 0.6243442622950819, "calib/mu_c": 0.781496062992126, "calib/mu_w": 0.45376068376068385, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.190655737704918, "calib/std_conf": 0.44240121191419035, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4389984101748808, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.10517449427474934, "calib/step_q_w": 0.33382391590013144, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 533.0390625, "completions/mean_terminated_length": 539.3596801757812, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.13653333333333334, "grad_norm": 0.026687778532505035, "kl": 0.0691680908203125, "learning_rate": 2.0000000000000003e-06, "loss": -0.0484, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.034037213772535324, "mask/share_reasoning": 0.8381680846214294, "mask/share_step_conf": 0.11607595533132553, "num_tokens": 30842860.0, "reward": 0.8984044790267944, "reward_std": 0.2292974293231964, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.8419756889343262, "rewards/final_brier_reward_step": 0.666551947593689, "rewards/format_reward_step": 0.9453125, "step": 128 }, { "adv/mean_abs_final_conf": 0.5893256664276123, "adv/mean_abs_reasoning": 0.3948642611503601, "adv/mean_abs_step_conf": 0.7625530958175659, "adv/ratio_final_to_reasoning": 1.4924765910967146, "adv/ratio_step_to_reasoning": 1.9311778016982748, "adv/std_final_conf": 0.8301159143447876, "adv/std_reasoning": 0.6815221309661865, "adv/std_step_conf": 0.9312430024147034, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6886455219030286, "calib/avg_num_step_conf": 6.2890625, "calib/ece": 0.2615294117647058, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6627450980392157, "calib/gap": 0.2917977382035617, "calib/mean_conf": 0.7284313725490196, "calib/mu_c": 0.8405732484076434, "calib/mu_w": 0.5487755102040817, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18713725490196073, "calib/std_conf": 0.4012772380298454, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43962121212121213, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.10281363194628501, "calib/step_q_w": 0.3368075801749271, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 483.7578125, "completions/mean_terminated_length": 483.7578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1376, "grad_norm": 0.042509667575359344, "kl": 0.07884979248046875, "learning_rate": 1.9722222222222224e-06, "loss": 0.0618, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0361248143017292, "mask/share_reasoning": 0.8258543610572815, "mask/share_step_conf": 0.1380208432674408, "num_tokens": 31069086.0, "reward": 0.970730721950531, "reward_std": 0.1526256501674652, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.8946923017501831, "rewards/final_brier_reward_step": 0.7248941659927368, "rewards/format_reward_step": 0.99609375, "step": 129 }, { "adv/mean_abs_final_conf": 0.5929268002510071, "adv/mean_abs_reasoning": 0.2917104959487915, "adv/mean_abs_step_conf": 0.7566444873809814, "adv/ratio_final_to_reasoning": 2.0325864461013183, "adv/ratio_step_to_reasoning": 2.5938198929729532, "adv/std_final_conf": 0.8083614706993103, "adv/std_reasoning": 0.5727423429489136, "adv/std_step_conf": 0.9330140948295593, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7233320147679325, "calib/avg_num_step_conf": 5.33203125, "calib/ece": 0.2249212598425196, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6889763779527559, "calib/gap": 0.385006592827004, "calib/mean_conf": 0.7233464566929134, "calib/mu_c": 0.8688607594936707, "calib/mu_w": 0.4838541666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16311023622047235, "calib/std_conf": 0.41253229657299734, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48749683944374206, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.103942832475101, "calib/step_q_w": 0.38355400696864106, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 471.3984375, "completions/mean_terminated_length": 471.3984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.13866666666666666, "grad_norm": 0.06835480779409409, "kl": 0.07553863525390625, "learning_rate": 1.944444444444445e-06, "loss": -0.0023, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03732430934906006, "mask/share_reasoning": 0.8383286595344543, "mask/share_step_conf": 0.1243470311164856, "num_tokens": 31295052.0, "reward": 0.9831303358078003, "reward_std": 0.14991185069084167, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8848813772201538, "rewards/final_brier_reward_step": 0.7595043182373047, "rewards/format_reward_step": 0.9921875, "step": 130 }, { "adv/mean_abs_final_conf": 0.6311858892440796, "adv/mean_abs_reasoning": 0.2988106906414032, "adv/mean_abs_step_conf": 0.7364592552185059, "adv/ratio_final_to_reasoning": 2.1123269983722013, "adv/ratio_step_to_reasoning": 2.4646348952163697, "adv/std_final_conf": 0.8326376676559448, "adv/std_reasoning": 0.6184049248695374, "adv/std_step_conf": 0.9334642291069031, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7967821142414061, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.2596825396825397, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5992063492063492, "calib/gap": 0.44134401654174205, "calib/mean_conf": 0.6542063492063492, "calib/mu_c": 0.9099056603773585, "calib/mu_w": 0.4685616438356165, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24662698412698414, "calib/std_conf": 0.43570204828566333, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4780145719489981, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.13446210671511694, "calib/step_q_w": 0.34355246523388117, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 479.078125, "completions/mean_terminated_length": 479.078125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.13973333333333332, "grad_norm": 0.06330462545156479, "kl": 0.06920623779296875, "learning_rate": 1.916666666666667e-06, "loss": -0.0458, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03542075306177139, "mask/share_reasoning": 0.8427149057388306, "mask/share_step_conf": 0.12186426669359207, "num_tokens": 31523904.0, "reward": 0.9323045015335083, "reward_std": 0.1720176488161087, "rewards/accuracy_reward_step": 0.4140625, "rewards/asymmetric_l2_reward": 0.8692620396614075, "rewards/final_brier_reward_step": 0.7156593799591064, "rewards/format_reward_step": 0.984375, "step": 131 }, { "adv/mean_abs_final_conf": 0.5419154167175293, "adv/mean_abs_reasoning": 0.39009517431259155, "adv/mean_abs_step_conf": 0.7669232487678528, "adv/ratio_final_to_reasoning": 1.3891876967523853, "adv/ratio_step_to_reasoning": 1.9659900949025861, "adv/std_final_conf": 0.7634062170982361, "adv/std_reasoning": 0.6612535119056702, "adv/std_step_conf": 0.932680070400238, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7523074894514769, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.24881889763779533, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7007874015748031, "calib/gap": 0.348175105485232, "calib/mean_conf": 0.7357480314960629, "calib/mu_c": 0.8673417721518987, "calib/mu_w": 0.5191666666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1812598425196851, "calib/std_conf": 0.40682372449389625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4939951573849879, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.158306632794824, "calib/step_q_w": 0.3356885245901639, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 509.984375, "completions/mean_terminated_length": 509.984375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1408, "grad_norm": 0.027949035167694092, "kl": 0.0719757080078125, "learning_rate": 1.888888888888889e-06, "loss": 0.0022, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03615806996822357, "mask/share_reasoning": 0.8362313508987427, "mask/share_step_conf": 0.12761051952838898, "num_tokens": 31760052.0, "reward": 0.9794524908065796, "reward_std": 0.16386666893959045, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8927135467529297, "rewards/final_brier_reward_step": 0.7443163394927979, "rewards/format_reward_step": 0.9921875, "step": 132 }, { "adv/mean_abs_final_conf": 0.7018519043922424, "adv/mean_abs_reasoning": 0.4902651607990265, "adv/mean_abs_step_conf": 0.7454802989959717, "adv/ratio_final_to_reasoning": 1.4315761357555474, "adv/ratio_step_to_reasoning": 1.5205655196485908, "adv/std_final_conf": 0.8919135332107544, "adv/std_reasoning": 0.7752693295478821, "adv/std_step_conf": 0.9336416721343994, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7900599270453361, "calib/avg_num_step_conf": 6.7890625, "calib/ece": 0.266600790513834, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5296442687747036, "calib/gap": 0.39868290776446064, "calib/mean_conf": 0.5900790513833992, "calib/mu_c": 0.8296039603960396, "calib/mu_w": 0.4309210526315789, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22873517786561268, "calib/std_conf": 0.44594793302387903, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40628205128205136, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.07311602811602819, "calib/step_q_w": 0.3331660231660232, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 595.953125, "completions/mean_terminated_length": 598.2902221679688, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.14186666666666667, "grad_norm": 0.05951628088951111, "kl": 0.0603179931640625, "learning_rate": 1.8611111111111113e-06, "loss": -0.0138, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028608456254005432, "mask/share_reasoning": 0.8479064702987671, "mask/share_step_conf": 0.11957882344722748, "num_tokens": 32018960.0, "reward": 0.9371469020843506, "reward_std": 0.21386194229125977, "rewards/accuracy_reward_step": 0.39453125, "rewards/asymmetric_l2_reward": 0.8900238871574402, "rewards/final_brier_reward_step": 0.707707405090332, "rewards/format_reward_step": 0.98828125, "step": 133 }, { "adv/mean_abs_final_conf": 0.7257549166679382, "adv/mean_abs_reasoning": 0.49450692534446716, "adv/mean_abs_step_conf": 0.752888560295105, "adv/ratio_final_to_reasoning": 1.4676334738131052, "adv/ratio_step_to_reasoning": 1.522503572160597, "adv/std_final_conf": 0.8783618807792664, "adv/std_reasoning": 0.7394267320632935, "adv/std_step_conf": 0.9331688284873962, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6966345244847797, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.30107142857142855, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6111111111111112, "calib/gap": 0.30402155416903004, "calib/mean_conf": 0.6541666666666667, "calib/mu_c": 0.8025581395348838, "calib/mu_w": 0.4985365853658537, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22166666666666668, "calib/std_conf": 0.4401944240359945, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42251948051948046, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.09624288477479959, "calib/step_q_w": 0.3262765957446809, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 585.1484375, "completions/mean_terminated_length": 587.4431762695312, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.14293333333333333, "grad_norm": 0.02902313508093357, "kl": 0.0628509521484375, "learning_rate": 1.8333333333333333e-06, "loss": -0.0718, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030042003840208054, "mask/share_reasoning": 0.8552829027175903, "mask/share_step_conf": 0.11076889932155609, "num_tokens": 32277710.0, "reward": 0.925879716873169, "reward_std": 0.22603853046894073, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.8767973184585571, "rewards/final_brier_reward_step": 0.6773058176040649, "rewards/format_reward_step": 0.984375, "step": 134 }, { "adv/mean_abs_final_conf": 0.6498154997825623, "adv/mean_abs_reasoning": 0.3868643045425415, "adv/mean_abs_step_conf": 0.7391627430915833, "adv/ratio_final_to_reasoning": 1.6796987784927708, "adv/ratio_step_to_reasoning": 1.9106511880583732, "adv/std_final_conf": 0.8298484683036804, "adv/std_reasoning": 0.6614306569099426, "adv/std_step_conf": 0.9338327646255493, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6963808760683761, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.2699592741935484, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.625, "calib/gap": 0.33560432692307685, "calib/mean_conf": 0.6729439516129033, "calib/mu_c": 0.8136812499999999, "calib/mu_w": 0.47807692307692307, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18112903225806457, "calib/std_conf": 0.4334902361093346, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4100843373493976, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.08741679464243307, "calib/step_q_w": 0.3226675427069645, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 573.79296875, "completions/mean_terminated_length": 573.79296875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.144, "grad_norm": 0.03258584067225456, "kl": 0.058887481689453125, "learning_rate": 1.8055555555555557e-06, "loss": 0.0267, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03373143821954727, "mask/share_reasoning": 0.8400686383247375, "mask/share_step_conf": 0.12619991600513458, "num_tokens": 32530481.0, "reward": 0.9410494565963745, "reward_std": 0.21206125617027283, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.8774411678314209, "rewards/final_brier_reward_step": 0.6991890668869019, "rewards/format_reward_step": 0.96484375, "step": 135 }, { "adv/mean_abs_final_conf": 0.6550817489624023, "adv/mean_abs_reasoning": 0.5279697775840759, "adv/mean_abs_step_conf": 0.7136498093605042, "adv/ratio_final_to_reasoning": 1.240756150778128, "adv/ratio_step_to_reasoning": 1.3516868572024652, "adv/std_final_conf": 0.8621448874473572, "adv/std_reasoning": 0.7927603721618652, "adv/std_step_conf": 0.9325664043426514, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7687752016129032, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.25138888888888883, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5317460317460317, "calib/gap": 0.3933266129032257, "calib/mean_conf": 0.5791666666666666, "calib/mu_c": 0.7789516129032257, "calib/mu_w": 0.385625, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16924603174603167, "calib/std_conf": 0.4533461351483745, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42105726872246696, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.16384820590063381, "calib/step_q_w": 0.25720906282183315, "calib/step_q_w_n": 971.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 528.12890625, "completions/mean_terminated_length": 530.2000122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.14506666666666668, "grad_norm": 0.04807325452566147, "kl": 0.07080078125, "learning_rate": 1.777777777777778e-06, "loss": 0.0226, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032773517072200775, "mask/share_reasoning": 0.830197274684906, "mask/share_step_conf": 0.13312296569347382, "num_tokens": 32774170.0, "reward": 0.9568853974342346, "reward_std": 0.19861853122711182, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.9026017189025879, "rewards/final_brier_reward_step": 0.7182003855705261, "rewards/format_reward_step": 0.98046875, "step": 136 }, { "adv/mean_abs_final_conf": 0.6266046762466431, "adv/mean_abs_reasoning": 0.4091912508010864, "adv/mean_abs_step_conf": 0.7550874352455139, "adv/ratio_final_to_reasoning": 1.5313247167917683, "adv/ratio_step_to_reasoning": 1.8453166673707118, "adv/std_final_conf": 0.8365187644958496, "adv/std_reasoning": 0.6816372275352478, "adv/std_step_conf": 0.9328610301017761, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7627720120522263, "calib/avg_num_step_conf": 6.84375, "calib/ece": 0.22254032258064518, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6008064516129032, "calib/gap": 0.42354670237696684, "calib/mean_conf": 0.6422983870967742, "calib/mu_c": 0.8182068965517242, "calib/mu_w": 0.39466019417475734, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14008064516129035, "calib/std_conf": 0.44154781788756414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40336353340883346, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.13861094422586456, "calib/step_q_w": 0.2647525891829689, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 539.8125, "completions/mean_terminated_length": 546.2134399414062, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.14613333333333334, "grad_norm": 0.04263077676296234, "kl": 0.0669097900390625, "learning_rate": 1.75e-06, "loss": -0.0123, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03168744221329689, "mask/share_reasoning": 0.8275086879730225, "mask/share_step_conf": 0.12908512353897095, "num_tokens": 33019346.0, "reward": 0.9610379934310913, "reward_std": 0.19014747440814972, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.8790087103843689, "rewards/final_brier_reward_step": 0.7368171215057373, "rewards/format_reward_step": 0.96484375, "step": 137 }, { "adv/mean_abs_final_conf": 0.6476581692695618, "adv/mean_abs_reasoning": 0.5334905385971069, "adv/mean_abs_step_conf": 0.7643733620643616, "adv/ratio_final_to_reasoning": 1.2140012285366404, "adv/ratio_step_to_reasoning": 1.4327777284942962, "adv/std_final_conf": 0.8558839559555054, "adv/std_reasoning": 0.7754238247871399, "adv/std_step_conf": 0.9321677684783936, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7955010224948875, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.18932806324110668, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5968379446640316, "calib/gap": 0.4857866394001363, "calib/mean_conf": 0.6537549407114623, "calib/mu_c": 0.8265644171779141, "calib/mu_w": 0.3407777777777778, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09940711462450587, "calib/std_conf": 0.42963415269661276, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4080204778156996, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.1297233158791387, "calib/step_q_w": 0.2782971619365609, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 496.4921875, "completions/mean_terminated_length": 498.4392395019531, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.1472, "grad_norm": 0.04252735897898674, "kl": 0.07735443115234375, "learning_rate": 1.7222222222222224e-06, "loss": -0.0624, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03474820777773857, "mask/share_reasoning": 0.8353586792945862, "mask/share_step_conf": 0.12598684430122375, "num_tokens": 33250784.0, "reward": 1.007904052734375, "reward_std": 0.1891259253025055, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.8937262892723083, "rewards/final_brier_reward_step": 0.7978628873825073, "rewards/format_reward_step": 0.984375, "step": 138 }, { "adv/mean_abs_final_conf": 0.6581767797470093, "adv/mean_abs_reasoning": 0.42416638135910034, "adv/mean_abs_step_conf": 0.7499033212661743, "adv/ratio_final_to_reasoning": 1.5516948270112787, "adv/ratio_step_to_reasoning": 1.7679461509027616, "adv/std_final_conf": 0.8394871950149536, "adv/std_reasoning": 0.7013220191001892, "adv/std_step_conf": 0.9318543672561646, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7805037191700379, "calib/avg_num_step_conf": 5.53515625, "calib/ece": 0.19721568627450975, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5450980392156862, "calib/gap": 0.4600561137935535, "calib/mean_conf": 0.6047450980392157, "calib/mu_c": 0.779746835443038, "calib/mu_w": 0.31969072164948453, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09117647058823526, "calib/std_conf": 0.4437459016386166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4019053398058252, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.10600314756299212, "calib/step_q_w": 0.2959021922428331, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 459.67578125, "completions/mean_terminated_length": 459.67578125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.14826666666666666, "grad_norm": 0.07648757100105286, "kl": 0.086883544921875, "learning_rate": 1.6944444444444446e-06, "loss": 0.0081, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03702244907617569, "mask/share_reasoning": 0.8356660604476929, "mask/share_step_conf": 0.12731149792671204, "num_tokens": 33471557.0, "reward": 1.0029046535491943, "reward_std": 0.16366134583950043, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.9031118750572205, "rewards/final_brier_reward_step": 0.7808222770690918, "rewards/format_reward_step": 0.9921875, "step": 139 }, { "adv/mean_abs_final_conf": 0.5114564895629883, "adv/mean_abs_reasoning": 0.33423930406570435, "adv/mean_abs_step_conf": 0.7621335983276367, "adv/ratio_final_to_reasoning": 1.5302104909315117, "adv/ratio_step_to_reasoning": 2.280203402343781, "adv/std_final_conf": 0.7727295160293579, "adv/std_reasoning": 0.6610760688781738, "adv/std_step_conf": 0.9312514066696167, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8144219396806622, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.1538976377952756, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6850393700787402, "calib/gap": 0.5125340035481964, "calib/mean_conf": 0.7275984251968504, "calib/mu_c": 0.8809550561797753, "calib/mu_w": 0.3684210526315789, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0903543307086614, "calib/std_conf": 0.4108740205767719, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44658747300215984, "calib/step_q_c_n": 926.0, "calib/step_q_gap": 0.15103191744660427, "calib/step_q_w": 0.29555555555555557, "calib/step_q_w_n": 450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 500.43359375, "completions/mean_terminated_length": 500.43359375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.14933333333333335, "grad_norm": 0.041224293410778046, "kl": 0.0767364501953125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0673, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03436052054166794, "mask/share_reasoning": 0.8470271825790405, "mask/share_step_conf": 0.11861232668161392, "num_tokens": 33704684.0, "reward": 1.0392565727233887, "reward_std": 0.12018037587404251, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.9118223190307617, "rewards/final_brier_reward_step": 0.8291909694671631, "rewards/format_reward_step": 0.9921875, "step": 140 }, { "adv/mean_abs_final_conf": 0.49427223205566406, "adv/mean_abs_reasoning": 0.4047102928161621, "adv/mean_abs_step_conf": 0.7349098920822144, "adv/ratio_final_to_reasoning": 1.2212988916498475, "adv/ratio_step_to_reasoning": 1.8158912810652037, "adv/std_final_conf": 0.7135685682296753, "adv/std_reasoning": 0.6816495060920715, "adv/std_step_conf": 0.9325038194656372, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.881882576310793, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.10734939759036147, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.642570281124498, "calib/gap": 0.6617306849715201, "calib/mean_conf": 0.6774698795180722, "calib/mu_c": 0.895389221556886, "calib/mu_w": 0.2336585365853659, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.057068273092369504, "calib/std_conf": 0.43568395210389493, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39104166666666673, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.12533534696406445, "calib/step_q_w": 0.2657063197026023, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 538.55078125, "completions/mean_terminated_length": 540.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.1504, "grad_norm": 0.03272243216633797, "kl": 0.063079833984375, "learning_rate": 1.638888888888889e-06, "loss": 0.0454, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032417528331279755, "mask/share_reasoning": 0.8407843112945557, "mask/share_step_conf": 0.12289191037416458, "num_tokens": 33949649.0, "reward": 1.0384899377822876, "reward_std": 0.14921121299266815, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.8945099115371704, "rewards/final_brier_reward_step": 0.857469916343689, "rewards/format_reward_step": 0.97265625, "step": 141 }, { "adv/mean_abs_final_conf": 0.6672189831733704, "adv/mean_abs_reasoning": 0.5137597918510437, "adv/mean_abs_step_conf": 0.7419067621231079, "adv/ratio_final_to_reasoning": 1.2986983289786518, "adv/ratio_step_to_reasoning": 1.4440732301180388, "adv/std_final_conf": 0.8465292453765869, "adv/std_reasoning": 0.7576342821121216, "adv/std_step_conf": 0.9329071640968323, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7659574468085107, "calib/avg_num_step_conf": 6.24609375, "calib/ece": 0.2413095238095237, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5396825396825397, "calib/gap": 0.40900709219858167, "calib/mean_conf": 0.5888492063492063, "calib/mu_c": 0.7690070921985817, "calib/mu_w": 0.36, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1353174603174602, "calib/std_conf": 0.45452646853028067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3878585308056873, "calib/step_q_c_n": 844.0, "calib/step_q_gap": 0.09809694140171377, "calib/step_q_w": 0.2897615894039735, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 547.94921875, "completions/mean_terminated_length": 547.94921875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.15146666666666667, "grad_norm": 0.045452363789081573, "kl": 0.07000732421875, "learning_rate": 1.6111111111111113e-06, "loss": 0.0256, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03237912803888321, "mask/share_reasoning": 0.842139482498169, "mask/share_step_conf": 0.12548136711120605, "num_tokens": 34195084.0, "reward": 0.9666558504104614, "reward_std": 0.18624988198280334, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.8902691602706909, "rewards/final_brier_reward_step": 0.736011266708374, "rewards/format_reward_step": 0.984375, "step": 142 }, { "adv/mean_abs_final_conf": 0.5834546089172363, "adv/mean_abs_reasoning": 0.4283401668071747, "adv/mean_abs_step_conf": 0.7408386468887329, "adv/ratio_final_to_reasoning": 1.3621291070278947, "adv/ratio_step_to_reasoning": 1.7295567969049124, "adv/std_final_conf": 0.8025461435317993, "adv/std_reasoning": 0.7204607129096985, "adv/std_step_conf": 0.9318289160728455, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8720285790598292, "calib/avg_num_step_conf": 6.3671875, "calib/ece": 0.1430645161290321, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6008064516129032, "calib/gap": 0.5630608974358975, "calib/mean_conf": 0.6624193548387097, "calib/mu_c": 0.8985416666666668, "calib/mu_w": 0.33548076923076925, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11241935483870953, "calib/std_conf": 0.4251126434551085, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4168470906630582, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.19198738247001668, "calib/step_q_w": 0.22485970819304152, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 567.859375, "completions/mean_terminated_length": 567.859375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.15253333333333333, "grad_norm": 0.05021341145038605, "kl": 0.06475830078125, "learning_rate": 1.5833333333333333e-06, "loss": 0.0437, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0319594144821167, "mask/share_reasoning": 0.8436037302017212, "mask/share_step_conf": 0.12443678081035614, "num_tokens": 34447792.0, "reward": 1.0080523490905762, "reward_std": 0.16858679056167603, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.892905592918396, "rewards/final_brier_reward_step": 0.8169492483139038, "rewards/format_reward_step": 0.96875, "step": 143 }, { "adv/mean_abs_final_conf": 0.5916790962219238, "adv/mean_abs_reasoning": 0.46679311990737915, "adv/mean_abs_step_conf": 0.7288160920143127, "adv/ratio_final_to_reasoning": 1.2675403106612293, "adv/ratio_step_to_reasoning": 1.5613256942581417, "adv/std_final_conf": 0.8130344152450562, "adv/std_reasoning": 0.7393408417701721, "adv/std_step_conf": 0.9324739575386047, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7388970588235293, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.24780000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.39191911764705895, "calib/mean_conf": 0.63188, "calib/mu_c": 0.7572941176470589, "calib/mu_w": 0.36537499999999995, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09984000000000003, "calib/std_conf": 0.4510858738643896, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3856683937823834, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.09519502100131827, "calib/step_q_w": 0.2904733727810651, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 542.84765625, "completions/mean_terminated_length": 544.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.1536, "grad_norm": 0.04553085193037987, "kl": 0.0696868896484375, "learning_rate": 1.5555555555555558e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033757805824279785, "mask/share_reasoning": 0.8401012420654297, "mask/share_step_conf": 0.12223471701145172, "num_tokens": 34690889.0, "reward": 0.9717831611633301, "reward_std": 0.16983790695667267, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.8857837915420532, "rewards/final_brier_reward_step": 0.7296574115753174, "rewards/format_reward_step": 0.9765625, "step": 144 }, { "adv/mean_abs_final_conf": 0.6282063722610474, "adv/mean_abs_reasoning": 0.5565600395202637, "adv/mean_abs_step_conf": 0.7417995929718018, "adv/ratio_final_to_reasoning": 1.128730644770222, "adv/ratio_step_to_reasoning": 1.3328294169506105, "adv/std_final_conf": 0.8449404835700989, "adv/std_reasoning": 0.8097800612449646, "adv/std_step_conf": 0.9331483244895935, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6935672514619883, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.2536254980079681, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7171314741035857, "calib/gap": 0.255464912280702, "calib/mean_conf": 0.7645418326693229, "calib/mu_c": 0.8459649122807019, "calib/mu_w": 0.5904999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16844621513944222, "calib/std_conf": 0.3863622472481406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40837301587301583, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.13369999134985505, "calib/step_q_w": 0.2746730245231608, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 501.26953125, "completions/mean_terminated_length": 503.2353210449219, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.15466666666666667, "grad_norm": 0.051369134336709976, "kl": 0.08301544189453125, "learning_rate": 1.527777777777778e-06, "loss": 0.1476, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03623100742697716, "mask/share_reasoning": 0.8176732063293457, "mask/share_step_conf": 0.14218956232070923, "num_tokens": 34921918.0, "reward": 0.9706292152404785, "reward_std": 0.20070935785770416, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.888382613658905, "rewards/final_brier_reward_step": 0.7231882810592651, "rewards/format_reward_step": 0.98046875, "step": 145 }, { "adv/mean_abs_final_conf": 0.679091215133667, "adv/mean_abs_reasoning": 0.48408961296081543, "adv/mean_abs_step_conf": 0.7339562177658081, "adv/ratio_final_to_reasoning": 1.4028212895958914, "adv/ratio_step_to_reasoning": 1.5161577487208304, "adv/std_final_conf": 0.8739967942237854, "adv/std_reasoning": 0.7575613856315613, "adv/std_step_conf": 0.9337763786315918, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7498391248391247, "calib/avg_num_step_conf": 5.89453125, "calib/ece": 0.2546613545816732, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5976095617529881, "calib/gap": 0.42930566280566274, "calib/mean_conf": 0.6447808764940238, "calib/mu_c": 0.8842342342342342, "calib/mu_w": 0.45492857142857146, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22860557768924294, "calib/std_conf": 0.43954187916360593, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42036423841059606, "calib/step_q_c_n": 604.0, "calib/step_q_gap": 0.12298302294098279, "calib/step_q_w": 0.29738121546961327, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 533.35546875, "completions/mean_terminated_length": 535.4470825195312, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.15573333333333333, "grad_norm": 0.03525509685277939, "kl": 0.0661773681640625, "learning_rate": 1.5e-06, "loss": -0.069, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03156570717692375, "mask/share_reasoning": 0.8492813110351562, "mask/share_step_conf": 0.1152467131614685, "num_tokens": 35165673.0, "reward": 0.9413388967514038, "reward_std": 0.20831407606601715, "rewards/accuracy_reward_step": 0.43359375, "rewards/asymmetric_l2_reward": 0.8832393884658813, "rewards/final_brier_reward_step": 0.7166258096694946, "rewards/format_reward_step": 0.98046875, "step": 146 }, { "adv/mean_abs_final_conf": 0.6118881702423096, "adv/mean_abs_reasoning": 0.44582316279411316, "adv/mean_abs_step_conf": 0.7450541257858276, "adv/ratio_final_to_reasoning": 1.3724907571141327, "adv/ratio_step_to_reasoning": 1.671187564854954, "adv/std_final_conf": 0.8005033135414124, "adv/std_reasoning": 0.7206440567970276, "adv/std_step_conf": 0.9336183667182922, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6944263787721123, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.2990725806451613, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6330645161290323, "calib/gap": 0.32943548387096777, "calib/mean_conf": 0.6661693548387098, "calib/mu_c": 0.8308870967741936, "calib/mu_w": 0.5014516129032258, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23262096774193552, "calib/std_conf": 0.44329803601898965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43249639249639243, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.11671039513840037, "calib/step_q_w": 0.31578599735799207, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 531.06640625, "completions/mean_terminated_length": 535.248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1568, "grad_norm": 0.04022669792175293, "kl": 0.08133697509765625, "learning_rate": 1.4722222222222225e-06, "loss": 0.0078, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03174225613474846, "mask/share_reasoning": 0.8448315262794495, "mask/share_step_conf": 0.11561372131109238, "num_tokens": 35405306.0, "reward": 0.9095112681388855, "reward_std": 0.21041998267173767, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.8593860864639282, "rewards/final_brier_reward_step": 0.6690112948417664, "rewards/format_reward_step": 0.96875, "step": 147 }, { "adv/mean_abs_final_conf": 0.524472713470459, "adv/mean_abs_reasoning": 0.4304084777832031, "adv/mean_abs_step_conf": 0.7623563408851624, "adv/ratio_final_to_reasoning": 1.218546428666389, "adv/ratio_step_to_reasoning": 1.7712391373228513, "adv/std_final_conf": 0.7791754007339478, "adv/std_reasoning": 0.7013992667198181, "adv/std_step_conf": 0.9334371089935303, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.752401059778109, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.2162248995983935, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7469879518072289, "calib/gap": 0.34049180327868855, "calib/mean_conf": 0.773574297188755, "calib/mu_c": 0.8638251366120219, "calib/mu_w": 0.5233333333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12742971887550195, "calib/std_conf": 0.3913611243091092, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4259635666347075, "calib/step_q_c_n": 1043.0, "calib/step_q_gap": 0.13167446710864117, "calib/step_q_w": 0.29428909952606636, "calib/step_q_w_n": 422.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 516.078125, "completions/mean_terminated_length": 520.1417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.15786666666666666, "grad_norm": 0.0584280788898468, "kl": 0.07340240478515625, "learning_rate": 1.4444444444444445e-06, "loss": -0.0201, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03611718863248825, "mask/share_reasoning": 0.8259831070899963, "mask/share_step_conf": 0.13008719682693481, "num_tokens": 35642534.0, "reward": 0.9886473417282104, "reward_std": 0.18284042179584503, "rewards/accuracy_reward_step": 0.71484375, "rewards/asymmetric_l2_reward": 0.8780118227005005, "rewards/final_brier_reward_step": 0.7617827653884888, "rewards/format_reward_step": 0.97265625, "step": 148 }, { "adv/mean_abs_final_conf": 0.566154956817627, "adv/mean_abs_reasoning": 0.4284716844558716, "adv/mean_abs_step_conf": 0.7544533014297485, "adv/ratio_final_to_reasoning": 1.3213357553290908, "adv/ratio_step_to_reasoning": 1.7608008388882228, "adv/std_final_conf": 0.7966720461845398, "adv/std_reasoning": 0.7013741135597229, "adv/std_step_conf": 0.9313024282455444, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8105348988910633, "calib/avg_num_step_conf": 6.25390625, "calib/ece": 0.16796812749003978, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7091633466135459, "calib/gap": 0.5252994129158515, "calib/mean_conf": 0.7432669322709163, "calib/mu_c": 0.9630136986301372, "calib/mu_w": 0.4377142857142857, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16478087649402384, "calib/std_conf": 0.4068463942424941, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4335380835380835, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.15961178112385216, "calib/step_q_w": 0.2739263024142313, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 548.18359375, "completions/mean_terminated_length": 550.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.15893333333333334, "grad_norm": 0.0576673299074173, "kl": 0.062652587890625, "learning_rate": 1.4166666666666667e-06, "loss": -0.0267, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03428906947374344, "mask/share_reasoning": 0.8384957313537598, "mask/share_step_conf": 0.12330888956785202, "num_tokens": 35887325.0, "reward": 1.0077321529388428, "reward_std": 0.17122933268547058, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.9006611108779907, "rewards/final_brier_reward_step": 0.8046468496322632, "rewards/format_reward_step": 0.98046875, "step": 149 }, { "adv/mean_abs_final_conf": 0.6269816160202026, "adv/mean_abs_reasoning": 0.46332746744155884, "adv/mean_abs_step_conf": 0.7745290994644165, "adv/ratio_final_to_reasoning": 1.3532148643859239, "adv/ratio_step_to_reasoning": 1.6716667020439722, "adv/std_final_conf": 0.8205464482307434, "adv/std_reasoning": 0.720609188079834, "adv/std_step_conf": 0.9327038526535034, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7517542652724271, "calib/avg_num_step_conf": 5.65625, "calib/ece": 0.24335999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.732, "calib/gap": 0.3722977435332966, "calib/mean_conf": 0.7546400000000001, "calib/mu_c": 0.8916455696202531, "calib/mu_w": 0.5193478260869565, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18299999999999997, "calib/std_conf": 0.41221071116602487, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42653179190751445, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.10047690339979576, "calib/step_q_w": 0.3260548885077187, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 478.453125, "completions/mean_terminated_length": 478.453125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.16, "grad_norm": 0.025151947513222694, "kl": 0.0756683349609375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0234, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.038418009877204895, "mask/share_reasoning": 0.8237306475639343, "mask/share_step_conf": 0.1378513127565384, "num_tokens": 36114769.0, "reward": 0.9638096690177917, "reward_std": 0.19389088451862335, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8756263256072998, "rewards/final_brier_reward_step": 0.734024167060852, "rewards/format_reward_step": 0.97265625, "step": 150 }, { "adv/mean_abs_final_conf": 0.6179725527763367, "adv/mean_abs_reasoning": 0.427360862493515, "adv/mean_abs_step_conf": 0.7608951330184937, "adv/ratio_final_to_reasoning": 1.4460204642293704, "adv/ratio_step_to_reasoning": 1.7804511357893467, "adv/std_final_conf": 0.8229371905326843, "adv/std_reasoning": 0.7013833522796631, "adv/std_step_conf": 0.9335458874702454, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8082643515714383, "calib/avg_num_step_conf": 6.76171875, "calib/ece": 0.22418032786885236, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.569672131147541, "calib/gap": 0.5015182717544922, "calib/mean_conf": 0.6102459016393442, "calib/mu_c": 0.8712820512820512, "calib/mu_w": 0.36976377952755907, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17745901639344253, "calib/std_conf": 0.4581940012822868, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40887905604719765, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.17155711872526036, "calib/step_q_w": 0.2373219373219373, "calib/step_q_w_n": 1053.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 582.28515625, "completions/mean_terminated_length": 591.52783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.16106666666666666, "grad_norm": 0.031244348734617233, "kl": 0.061309814453125, "learning_rate": 1.3611111111111112e-06, "loss": -0.0475, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030964188277721405, "mask/share_reasoning": 0.8384581804275513, "mask/share_step_conf": 0.11495261639356613, "num_tokens": 36370858.0, "reward": 0.9330952167510986, "reward_std": 0.18629847466945648, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.8467037081718445, "rewards/final_brier_reward_step": 0.7374554872512817, "rewards/format_reward_step": 0.953125, "step": 151 }, { "adv/mean_abs_final_conf": 0.6945838928222656, "adv/mean_abs_reasoning": 0.5879529118537903, "adv/mean_abs_step_conf": 0.7506691217422485, "adv/ratio_final_to_reasoning": 1.1813597293570202, "adv/ratio_step_to_reasoning": 1.276750410803173, "adv/std_final_conf": 0.8834863901138306, "adv/std_reasoning": 0.8265911340713501, "adv/std_step_conf": 0.9328155517578125, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7469903038979632, "calib/avg_num_step_conf": 6.3828125, "calib/ece": 0.3006854838709677, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5967741935483871, "calib/gap": 0.33690440554434803, "calib/mean_conf": 0.6297177419354838, "calib/mu_c": 0.7940944881889762, "calib/mu_w": 0.4571900826446282, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2091532258064516, "calib/std_conf": 0.45284085469977603, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.3859042553191489, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.10618770203116701, "calib/step_q_w": 0.2797165532879819, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 544.1328125, "completions/mean_terminated_length": 548.4172973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.16213333333333332, "grad_norm": 0.04397697374224663, "kl": 0.09905242919921875, "learning_rate": 1.3333333333333334e-06, "loss": -0.0666, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033129818737506866, "mask/share_reasoning": 0.8305137157440186, "mask/share_step_conf": 0.1285439282655716, "num_tokens": 36615548.0, "reward": 0.9151645302772522, "reward_std": 0.22851672768592834, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.8679298162460327, "rewards/final_brier_reward_step": 0.6702117323875427, "rewards/format_reward_step": 0.9609375, "step": 152 }, { "adv/mean_abs_final_conf": 0.6541010141372681, "adv/mean_abs_reasoning": 0.4219273328781128, "adv/mean_abs_step_conf": 0.7459571361541748, "adv/ratio_final_to_reasoning": 1.550269354856482, "adv/ratio_step_to_reasoning": 1.7679753787595183, "adv/std_final_conf": 0.8594304323196411, "adv/std_reasoning": 0.7204716801643372, "adv/std_step_conf": 0.932479202747345, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6297800673667524, "calib/avg_num_step_conf": 5.5625, "calib/ece": 0.3261200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.648, "calib/gap": 0.2313539396341061, "calib/mean_conf": 0.68924, "calib/mu_c": 0.7845578231292518, "calib/mu_w": 0.5532038834951457, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21368000000000006, "calib/std_conf": 0.43288130289953614, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4063384188626907, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.13020755115287558, "calib/step_q_w": 0.2761308677098151, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 520.55078125, "completions/mean_terminated_length": 522.5921630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.1632, "grad_norm": 0.0666937530040741, "kl": 0.0721282958984375, "learning_rate": 1.3055555555555556e-06, "loss": 0.0571, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03248149901628494, "mask/share_reasoning": 0.8496717214584351, "mask/share_step_conf": 0.11394055187702179, "num_tokens": 36856129.0, "reward": 0.9188251495361328, "reward_std": 0.19821104407310486, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.8702654838562012, "rewards/final_brier_reward_step": 0.6564472913742065, "rewards/format_reward_step": 0.9765625, "step": 153 }, { "adv/mean_abs_final_conf": 0.593313455581665, "adv/mean_abs_reasoning": 0.40208834409713745, "adv/mean_abs_step_conf": 0.7636384963989258, "adv/ratio_final_to_reasoning": 1.4755798428176545, "adv/ratio_step_to_reasoning": 1.8991808830311285, "adv/std_final_conf": 0.8072009682655334, "adv/std_reasoning": 0.6816769242286682, "adv/std_step_conf": 0.9328876733779907, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7256451612903225, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.2687550200803214, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6024096385542169, "calib/gap": 0.3908619354838709, "calib/mean_conf": 0.6415261044176708, "calib/mu_c": 0.8377419354838709, "calib/mu_w": 0.44688, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2061445783132531, "calib/std_conf": 0.4517632571194984, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4189130434782608, "calib/step_q_c_n": 644.0, "calib/step_q_gap": 0.12159287422579251, "calib/step_q_w": 0.2973201692524683, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 517.84375, "completions/mean_terminated_length": 519.87451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.16426666666666667, "grad_norm": 0.04594266042113304, "kl": 0.075714111328125, "learning_rate": 1.2777777777777779e-06, "loss": 0.0992, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034510090947151184, "mask/share_reasoning": 0.8454740047454834, "mask/share_step_conf": 0.11610963195562363, "num_tokens": 37093137.0, "reward": 0.9312186241149902, "reward_std": 0.18028542399406433, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.8699989318847656, "rewards/final_brier_reward_step": 0.7010320425033569, "rewards/format_reward_step": 0.97265625, "step": 154 }, { "adv/mean_abs_final_conf": 0.6469031572341919, "adv/mean_abs_reasoning": 0.47271543741226196, "adv/mean_abs_step_conf": 0.7635318040847778, "adv/ratio_final_to_reasoning": 1.3684832481364857, "adv/ratio_step_to_reasoning": 1.6152038703548637, "adv/std_final_conf": 0.8352489471435547, "adv/std_reasoning": 0.7392789125442505, "adv/std_step_conf": 0.9331855773925781, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7134146341463414, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.29729411764705893, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5058823529411764, "calib/gap": 0.32256097560975616, "calib/mean_conf": 0.5472549019607844, "calib/mu_c": 0.7142276422764228, "calib/mu_w": 0.3916666666666666, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18109803921568635, "calib/std_conf": 0.46258641346990637, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.36075822603719593, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.08251463629360617, "calib/step_q_w": 0.27824358974358976, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 478.0625, "completions/mean_terminated_length": 478.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.16533333333333333, "grad_norm": 0.0336977019906044, "kl": 0.08032989501953125, "learning_rate": 1.25e-06, "loss": 0.0138, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03487030789256096, "mask/share_reasoning": 0.8355661034584045, "mask/share_step_conf": 0.12956362962722778, "num_tokens": 37322737.0, "reward": 0.9273616671562195, "reward_std": 0.1708342432975769, "rewards/accuracy_reward_step": 0.48046875, "rewards/asymmetric_l2_reward": 0.8782615661621094, "rewards/final_brier_reward_step": 0.6827117204666138, "rewards/format_reward_step": 0.98828125, "step": 155 }, { "adv/mean_abs_final_conf": 0.6737991571426392, "adv/mean_abs_reasoning": 0.4827231764793396, "adv/mean_abs_step_conf": 0.7515172958374023, "adv/ratio_final_to_reasoning": 1.3958293075067996, "adv/ratio_step_to_reasoning": 1.5568287011169994, "adv/std_final_conf": 0.8648738265037537, "adv/std_reasoning": 0.7206152677536011, "adv/std_step_conf": 0.9330052733421326, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7186783804430862, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.25689243027888453, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5577689243027888, "calib/gap": 0.3968054494525084, "calib/mean_conf": 0.6011155378486056, "calib/mu_c": 0.7892424242424243, "calib/mu_w": 0.3924369747899159, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1660557768924303, "calib/std_conf": 0.4584387486378078, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3498605830164765, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.0351169932728867, "calib/step_q_w": 0.3147435897435898, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 514.51171875, "completions/mean_terminated_length": 514.51171875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1664, "grad_norm": 0.04300342872738838, "kl": 0.08170700073242188, "learning_rate": 1.2222222222222223e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03448965772986412, "mask/share_reasoning": 0.8360552787780762, "mask/share_step_conf": 0.129455104470253, "num_tokens": 37559212.0, "reward": 0.9496381282806396, "reward_std": 0.16709250211715698, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.8816511631011963, "rewards/final_brier_reward_step": 0.7184062004089355, "rewards/format_reward_step": 0.98046875, "step": 156 }, { "adv/mean_abs_final_conf": 0.5912591218948364, "adv/mean_abs_reasoning": 0.49544456601142883, "adv/mean_abs_step_conf": 0.7444217205047607, "adv/ratio_final_to_reasoning": 1.1933910722944479, "adv/ratio_step_to_reasoning": 1.5025328191561769, "adv/std_final_conf": 0.82242351770401, "adv/std_reasoning": 0.7575790882110596, "adv/std_step_conf": 0.933133602142334, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.790162701668034, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.20235059760956173, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6135458167330677, "calib/gap": 0.49839896089691005, "calib/mean_conf": 0.6459362549800797, "calib/mu_c": 0.8286163522012578, "calib/mu_w": 0.3302173913043478, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10741035856573701, "calib/std_conf": 0.45350540384328225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3897894736842106, "calib/step_q_c_n": 1045.0, "calib/step_q_gap": 0.11057481399834668, "calib/step_q_w": 0.2792146596858639, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 497.61328125, "completions/mean_terminated_length": 501.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.16746666666666668, "grad_norm": 0.04095279052853584, "kl": 0.0734100341796875, "learning_rate": 1.1944444444444446e-06, "loss": -0.0316, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03470167517662048, "mask/share_reasoning": 0.8203328847885132, "mask/share_step_conf": 0.13715294003486633, "num_tokens": 37790329.0, "reward": 0.9915717244148254, "reward_std": 0.1898421347141266, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.8848938345909119, "rewards/final_brier_reward_step": 0.7779370546340942, "rewards/format_reward_step": 0.98046875, "step": 157 }, { "adv/mean_abs_final_conf": 0.5748130083084106, "adv/mean_abs_reasoning": 0.41429877281188965, "adv/mean_abs_step_conf": 0.7429898977279663, "adv/ratio_final_to_reasoning": 1.3874359424409923, "adv/ratio_step_to_reasoning": 1.7933673630873082, "adv/std_final_conf": 0.7997155785560608, "adv/std_reasoning": 0.6816416382789612, "adv/std_step_conf": 0.9338192939758301, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7263106796116505, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.27177865612648217, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6086956521739131, "calib/gap": 0.3565825242718448, "calib/mean_conf": 0.6508300395256919, "calib/mu_c": 0.7960000000000002, "calib/mu_w": 0.43941747572815537, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16486166007905134, "calib/std_conf": 0.4487590517510003, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42103761348897534, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.1080310775412629, "calib/step_q_w": 0.31300653594771244, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 500.37109375, "completions/mean_terminated_length": 500.37109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.16853333333333334, "grad_norm": 0.05039157345890999, "kl": 0.0789794921875, "learning_rate": 1.1666666666666668e-06, "loss": 0.0354, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037866123020648956, "mask/share_reasoning": 0.836431622505188, "mask/share_step_conf": 0.12570224702358246, "num_tokens": 38023664.0, "reward": 0.9502644538879395, "reward_std": 0.16423243284225464, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8681695461273193, "rewards/final_brier_reward_step": 0.7175155878067017, "rewards/format_reward_step": 0.98828125, "step": 158 }, { "adv/mean_abs_final_conf": 0.6193236708641052, "adv/mean_abs_reasoning": 0.4926747977733612, "adv/mean_abs_step_conf": 0.7591203451156616, "adv/ratio_final_to_reasoning": 1.2570638353395227, "adv/ratio_step_to_reasoning": 1.540814242064945, "adv/std_final_conf": 0.815812349319458, "adv/std_reasoning": 0.7574940323829651, "adv/std_step_conf": 0.9329251050949097, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6892206410426918, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.3099196787148594, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5140562248995983, "calib/gap": 0.3123513765128343, "calib/mean_conf": 0.5535341365461848, "calib/mu_c": 0.6827397260273973, "calib/mu_w": 0.370388349514563, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13855421686746988, "calib/std_conf": 0.46807771451615177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42608767123287666, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.1379412537873938, "calib/step_q_w": 0.28814641744548286, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 501.59375, "completions/mean_terminated_length": 501.59375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1696, "grad_norm": 0.049436114728450775, "kl": 0.080230712890625, "learning_rate": 1.138888888888889e-06, "loss": 0.0599, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03590218350291252, "mask/share_reasoning": 0.8452440500259399, "mask/share_step_conf": 0.11885374784469604, "num_tokens": 38256856.0, "reward": 0.9326581358909607, "reward_std": 0.1965138167142868, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.8867565393447876, "rewards/final_brier_reward_step": 0.6699659824371338, "rewards/format_reward_step": 0.97265625, "step": 159 }, { "adv/mean_abs_final_conf": 0.651262104511261, "adv/mean_abs_reasoning": 0.4572453498840332, "adv/mean_abs_step_conf": 0.7492605447769165, "adv/ratio_final_to_reasoning": 1.4243165177654282, "adv/ratio_step_to_reasoning": 1.6386400538943575, "adv/std_final_conf": 0.8251065611839294, "adv/std_reasoning": 0.7206948399543762, "adv/std_step_conf": 0.933678925037384, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7278619864379737, "calib/avg_num_step_conf": 5.7578125, "calib/ece": 0.30607287449392717, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5060728744939271, "calib/gap": 0.35182289589150384, "calib/mean_conf": 0.5299595141700404, "calib/mu_c": 0.6852173913043479, "calib/mu_w": 0.33339449541284405, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13866396761133604, "calib/std_conf": 0.4773497160335752, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.40606488011283504, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.13901912847884806, "calib/step_q_w": 0.26704575163398697, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 499.25, "completions/mean_terminated_length": 501.2078857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.17066666666666666, "grad_norm": 0.03691410645842552, "kl": 0.0764007568359375, "learning_rate": 1.111111111111111e-06, "loss": 0.0152, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03438437357544899, "mask/share_reasoning": 0.8391258716583252, "mask/share_step_conf": 0.12258350849151611, "num_tokens": 38489504.0, "reward": 0.9193039536476135, "reward_std": 0.19498516619205475, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.8641306161880493, "rewards/final_brier_reward_step": 0.6736960411071777, "rewards/format_reward_step": 0.96484375, "step": 160 }, { "adv/mean_abs_final_conf": 0.5909967422485352, "adv/mean_abs_reasoning": 0.3688772916793823, "adv/mean_abs_step_conf": 0.748254120349884, "adv/ratio_final_to_reasoning": 1.602149971221901, "adv/ratio_step_to_reasoning": 2.028463495118711, "adv/std_final_conf": 0.8158046007156372, "adv/std_reasoning": 0.6814883947372437, "adv/std_step_conf": 0.9323292374610901, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7071225071225071, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.32039370078740165, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5118110236220472, "calib/gap": 0.3662433862433863, "calib/mean_conf": 0.5425196850393701, "calib/mu_c": 0.6362433862433863, "calib/mu_w": 0.26999999999999996, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05940944881889766, "calib/std_conf": 0.47380262428341774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38986328125, "calib/step_q_c_n": 1024.0, "calib/step_q_gap": 0.11586328125, "calib/step_q_w": 0.274, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 456.0859375, "completions/mean_terminated_length": 456.0859375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.17173333333333332, "grad_norm": 0.10700822621583939, "kl": 0.079345703125, "learning_rate": 1.0833333333333335e-06, "loss": 0.0686, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03930460289120674, "mask/share_reasoning": 0.8258973360061646, "mask/share_step_conf": 0.1347980797290802, "num_tokens": 38710182.0, "reward": 0.9591859579086304, "reward_std": 0.14740516245365143, "rewards/accuracy_reward_step": 0.73828125, "rewards/asymmetric_l2_reward": 0.8936820030212402, "rewards/final_brier_reward_step": 0.6785961389541626, "rewards/format_reward_step": 0.9921875, "step": 161 }, { "adv/mean_abs_final_conf": 0.5586308240890503, "adv/mean_abs_reasoning": 0.4160280227661133, "adv/mean_abs_step_conf": 0.7616361975669861, "adv/ratio_final_to_reasoning": 1.3427721055297923, "adv/ratio_step_to_reasoning": 1.8307329215541093, "adv/std_final_conf": 0.7841950058937073, "adv/std_reasoning": 0.701278567314148, "adv/std_step_conf": 0.9325926899909973, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8157858707557503, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.20877952755905524, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5669291338582677, "calib/gap": 0.5165840635268346, "calib/mean_conf": 0.6009055118110236, "calib/mu_c": 0.7798795180722891, "calib/mu_w": 0.2632954545454545, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07807086614173239, "calib/std_conf": 0.4637407457415819, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4075116279069767, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.1085157262676324, "calib/step_q_w": 0.2989959016393443, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 442.27734375, "completions/mean_terminated_length": 444.01177978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1728, "grad_norm": 0.08423297107219696, "kl": 0.08339691162109375, "learning_rate": 1.0555555555555557e-06, "loss": 0.0107, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03754296526312828, "mask/share_reasoning": 0.8310139179229736, "mask/share_step_conf": 0.1275368630886078, "num_tokens": 38927549.0, "reward": 0.999754786491394, "reward_std": 0.14653810858726501, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.8925575613975525, "rewards/final_brier_reward_step": 0.7796082496643066, "rewards/format_reward_step": 0.98828125, "step": 162 }, { "adv/mean_abs_final_conf": 0.5742782354354858, "adv/mean_abs_reasoning": 0.4988449811935425, "adv/mean_abs_step_conf": 0.7395628690719604, "adv/ratio_final_to_reasoning": 1.1512158227220426, "adv/ratio_step_to_reasoning": 1.4825504855285372, "adv/std_final_conf": 0.8037339448928833, "adv/std_reasoning": 0.7575864791870117, "adv/std_step_conf": 0.9324069023132324, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7783159227603672, "calib/avg_num_step_conf": 6.00390625, "calib/ece": 0.22789682539682532, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.46825396825396826, "calib/gap": 0.4833561253561253, "calib/mean_conf": 0.5043253968253968, "calib/mu_c": 0.7287407407407407, "calib/mu_w": 0.24538461538461537, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09825396825396818, "calib/std_conf": 0.47483825699640314, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4153507565337002, "calib/step_q_c_n": 727.0, "calib/step_q_gap": 0.15149890468184835, "calib/step_q_w": 0.26385185185185184, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 545.7578125, "completions/mean_terminated_length": 547.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.17386666666666667, "grad_norm": 0.02793508768081665, "kl": 0.07421112060546875, "learning_rate": 1.0277777777777777e-06, "loss": -0.0248, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03470218926668167, "mask/share_reasoning": 0.8333349227905273, "mask/share_step_conf": 0.1280566155910492, "num_tokens": 39172095.0, "reward": 0.9705761671066284, "reward_std": 0.16870234906673431, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.8863610029220581, "rewards/final_brier_reward_step": 0.7532289028167725, "rewards/format_reward_step": 0.98046875, "step": 163 }, { "adv/mean_abs_final_conf": 0.6500420570373535, "adv/mean_abs_reasoning": 0.44920510053634644, "adv/mean_abs_step_conf": 0.7497342824935913, "adv/ratio_final_to_reasoning": 1.4470941141612368, "adv/ratio_step_to_reasoning": 1.6690244202445965, "adv/std_final_conf": 0.8430293202400208, "adv/std_reasoning": 0.7015290856361389, "adv/std_step_conf": 0.9337033033370972, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.770204987596292, "calib/avg_num_step_conf": 5.8984375, "calib/ece": 0.25626506024096396, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5301204819277109, "calib/gap": 0.42616921269095165, "calib/mean_conf": 0.5546586345381527, "calib/mu_c": 0.7446376811594202, "calib/mu_w": 0.31846846846846855, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12835341365461853, "calib/std_conf": 0.47459640847724605, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3912232030264817, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.10328735923289734, "calib/step_q_w": 0.2879358437935844, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 551.2109375, "completions/mean_terminated_length": 557.7470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.17493333333333333, "grad_norm": 0.06633436679840088, "kl": 0.07451629638671875, "learning_rate": 1.0000000000000002e-06, "loss": -0.0535, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030875276774168015, "mask/share_reasoning": 0.8452262282371521, "mask/share_step_conf": 0.11217975616455078, "num_tokens": 39419341.0, "reward": 0.9514139890670776, "reward_std": 0.2097827047109604, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.8823947906494141, "rewards/final_brier_reward_step": 0.7180894613265991, "rewards/format_reward_step": 0.97265625, "step": 164 }, { "adv/mean_abs_final_conf": 0.5945655107498169, "adv/mean_abs_reasoning": 0.49014484882354736, "adv/mean_abs_step_conf": 0.7411804795265198, "adv/ratio_final_to_reasoning": 1.2130404148424725, "adv/ratio_step_to_reasoning": 1.5121662123054271, "adv/std_final_conf": 0.8082362413406372, "adv/std_reasoning": 0.7575728297233582, "adv/std_step_conf": 0.9333819150924683, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7232782898105479, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.2599600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.572, "calib/gap": 0.4444726062467999, "calib/mean_conf": 0.60276, "calib/mu_c": 0.8267741935483872, "calib/mu_w": 0.3823015873015873, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18336000000000005, "calib/std_conf": 0.47151583472880315, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4217050691244239, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.1091423214361808, "calib/step_q_w": 0.3125627476882431, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 535.36328125, "completions/mean_terminated_length": 535.36328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.176, "grad_norm": 0.06460738927125931, "kl": 0.12464141845703125, "learning_rate": 9.722222222222224e-07, "loss": -0.0664, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03398456051945686, "mask/share_reasoning": 0.8485789895057678, "mask/share_step_conf": 0.11743646115064621, "num_tokens": 39661970.0, "reward": 0.9417548775672913, "reward_std": 0.1901148110628128, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.8748019337654114, "rewards/final_brier_reward_step": 0.7173015475273132, "rewards/format_reward_step": 0.97265625, "step": 165 }, { "adv/mean_abs_final_conf": 0.4994466304779053, "adv/mean_abs_reasoning": 0.37650659680366516, "adv/mean_abs_step_conf": 0.7547671794891357, "adv/ratio_final_to_reasoning": 1.326528232753247, "adv/ratio_step_to_reasoning": 2.004658579415861, "adv/std_final_conf": 0.7379820942878723, "adv/std_reasoning": 0.6612639427185059, "adv/std_step_conf": 0.9327965378761292, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8125730994152047, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.18629482071713144, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.5553486842105264, "calib/mean_conf": 0.6624701195219124, "calib/mu_c": 0.8394736842105264, "calib/mu_w": 0.284125, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08374501992031869, "calib/std_conf": 0.4574895489015776, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4180082559339525, "calib/step_q_c_n": 969.0, "calib/step_q_gap": 0.15486733912580308, "calib/step_q_w": 0.26314091680814944, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 531.1875, "completions/mean_terminated_length": 533.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.17706666666666668, "grad_norm": 0.0442763976752758, "kl": 0.06806182861328125, "learning_rate": 9.444444444444445e-07, "loss": -0.0328, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032991744577884674, "mask/share_reasoning": 0.8350811004638672, "mask/share_step_conf": 0.12802088260650635, "num_tokens": 39904138.0, "reward": 1.0148489475250244, "reward_std": 0.15825651586055756, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.9015299081802368, "rewards/final_brier_reward_step": 0.7984804511070251, "rewards/format_reward_step": 0.98046875, "step": 166 }, { "adv/mean_abs_final_conf": 0.48053890466690063, "adv/mean_abs_reasoning": 0.4025387167930603, "adv/mean_abs_step_conf": 0.7606292963027954, "adv/ratio_final_to_reasoning": 1.1937706476913603, "adv/ratio_step_to_reasoning": 1.8895804666009919, "adv/std_final_conf": 0.7448135614395142, "adv/std_reasoning": 0.6815594434738159, "adv/std_step_conf": 0.9320181608200073, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6876629324314802, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.23704724409448819, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7637795275590551, "calib/gap": 0.3711963644049886, "calib/mean_conf": 0.770984251968504, "calib/mu_c": 0.8922807017543862, "calib/mu_w": 0.5210843373493976, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16740157480314963, "calib/std_conf": 0.4104494077074939, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4270010235414535, "calib/step_q_c_n": 977.0, "calib/step_q_gap": 0.05418249072292064, "calib/step_q_w": 0.37281853281853283, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 474.296875, "completions/mean_terminated_length": 476.1568908691406, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.17813333333333334, "grad_norm": 0.051055651158094406, "kl": 0.07321929931640625, "learning_rate": 9.166666666666666e-07, "loss": -0.0208, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03513041511178017, "mask/share_reasoning": 0.8342991471290588, "mask/share_step_conf": 0.1266641467809677, "num_tokens": 40131166.0, "reward": 0.9845085144042969, "reward_std": 0.1467195451259613, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.8823486566543579, "rewards/final_brier_reward_step": 0.7554183006286621, "rewards/format_reward_step": 0.98828125, "step": 167 }, { "adv/mean_abs_final_conf": 0.5689985752105713, "adv/mean_abs_reasoning": 0.5295567512512207, "adv/mean_abs_step_conf": 0.7315965294837952, "adv/ratio_final_to_reasoning": 1.0744808254566836, "adv/ratio_step_to_reasoning": 1.3815262061246523, "adv/std_final_conf": 0.7971473336219788, "adv/std_reasoning": 0.7928540110588074, "adv/std_step_conf": 0.9335496425628662, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.712508809020437, "calib/avg_num_step_conf": 6.125, "calib/ece": 0.2544621513944223, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6892430278884463, "calib/gap": 0.3601057082452431, "calib/mean_conf": 0.7120717131474104, "calib/mu_c": 0.8354545454545454, "calib/mu_w": 0.4753488372093024, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15458167330677292, "calib/std_conf": 0.4352474348213658, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4282789651293588, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.17523036424570637, "calib/step_q_w": 0.25304860088365244, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 566.99609375, "completions/mean_terminated_length": 569.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.1792, "grad_norm": 0.031764764338731766, "kl": 0.06189727783203125, "learning_rate": 8.88888888888889e-07, "loss": 0.0167, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03189847618341446, "mask/share_reasoning": 0.8458576798439026, "mask/share_step_conf": 0.11833761632442474, "num_tokens": 40380989.0, "reward": 0.9734334945678711, "reward_std": 0.20216943323612213, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.8927955031394958, "rewards/final_brier_reward_step": 0.7298526763916016, "rewards/format_reward_step": 0.9765625, "step": 168 }, { "adv/mean_abs_final_conf": 0.5428816080093384, "adv/mean_abs_reasoning": 0.45098912715911865, "adv/mean_abs_step_conf": 0.7654911279678345, "adv/ratio_final_to_reasoning": 1.2037576414071687, "adv/ratio_step_to_reasoning": 1.6973604946749699, "adv/std_final_conf": 0.7792088985443115, "adv/std_reasoning": 0.7014132142066956, "adv/std_step_conf": 0.9325076341629028, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.709286971830986, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.29669291338582693, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6929133858267716, "calib/gap": 0.3395787223340041, "calib/mean_conf": 0.718503937007874, "calib/mu_c": 0.8682394366197184, "calib/mu_w": 0.5286607142857143, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2280708661417324, "calib/std_conf": 0.4331410250972485, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4691484049930652, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.12826460793905542, "calib/step_q_w": 0.3408837970540098, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 477.62890625, "completions/mean_terminated_length": 479.5019836425781, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.18026666666666666, "grad_norm": 0.036118652671575546, "kl": 0.0745849609375, "learning_rate": 8.611111111111112e-07, "loss": 0.0798, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03544265776872635, "mask/share_reasoning": 0.8431687355041504, "mask/share_step_conf": 0.11748235672712326, "num_tokens": 40607446.0, "reward": 0.9495621919631958, "reward_std": 0.17833659052848816, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8874062895774841, "rewards/final_brier_reward_step": 0.7023429870605469, "rewards/format_reward_step": 0.9921875, "step": 169 }, { "adv/mean_abs_final_conf": 0.5604931116104126, "adv/mean_abs_reasoning": 0.47254762053489685, "adv/mean_abs_step_conf": 0.7200212478637695, "adv/ratio_final_to_reasoning": 1.1861092665665451, "adv/ratio_step_to_reasoning": 1.523700927853042, "adv/std_final_conf": 0.7950620651245117, "adv/std_reasoning": 0.7573960423469543, "adv/std_step_conf": 0.9339142441749573, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7524666666666667, "calib/avg_num_step_conf": 6.33984375, "calib/ece": 0.23223999999999984, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.732, "calib/gap": 0.4186000000000002, "calib/mean_conf": 0.75776, "calib/mu_c": 0.9252000000000002, "calib/mu_w": 0.5066, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19499999999999984, "calib/std_conf": 0.4107035212899933, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4403399122807018, "calib/step_q_c_n": 912.0, "calib/step_q_gap": 0.1492147364719817, "calib/step_q_w": 0.2911251758087201, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 536.828125, "completions/mean_terminated_length": 536.828125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.18133333333333335, "grad_norm": 0.04905329644680023, "kl": 0.06963348388671875, "learning_rate": 8.333333333333333e-07, "loss": 0.0278, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03186079114675522, "mask/share_reasoning": 0.8381197452545166, "mask/share_step_conf": 0.13001945614814758, "num_tokens": 40849026.0, "reward": 0.972845196723938, "reward_std": 0.2081618309020996, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8838129043579102, "rewards/final_brier_reward_step": 0.7493773698806763, "rewards/format_reward_step": 0.9765625, "step": 170 }, { "adv/mean_abs_final_conf": 0.6511637568473816, "adv/mean_abs_reasoning": 0.5285821557044983, "adv/mean_abs_step_conf": 0.7396926879882812, "adv/ratio_final_to_reasoning": 1.2319064308546428, "adv/ratio_step_to_reasoning": 1.3993901988658948, "adv/std_final_conf": 0.8735871315002441, "adv/std_reasoning": 0.79267817735672, "adv/std_step_conf": 0.9336190223693848, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.708744492025073, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.2965354330708661, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6496062992125984, "calib/gap": 0.3646049773474835, "calib/mean_conf": 0.6776377952755905, "calib/mu_c": 0.8541984732824428, "calib/mu_w": 0.4895934959349593, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22921259842519678, "calib/std_conf": 0.4501171443201061, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4538608458390178, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.15528319707268978, "calib/step_q_w": 0.29857764876632803, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 484.640625, "completions/mean_terminated_length": 486.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.1824, "grad_norm": 0.04068433865904808, "kl": 0.06620025634765625, "learning_rate": 8.055555555555557e-07, "loss": -0.0137, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03494654595851898, "mask/share_reasoning": 0.8401431441307068, "mask/share_step_conf": 0.12100405246019363, "num_tokens": 41079990.0, "reward": 0.9433212876319885, "reward_std": 0.1982666552066803, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.8885831832885742, "rewards/final_brier_reward_step": 0.6980593800544739, "rewards/format_reward_step": 0.98828125, "step": 171 }, { "adv/mean_abs_final_conf": 0.5560652017593384, "adv/mean_abs_reasoning": 0.41541504859924316, "adv/mean_abs_step_conf": 0.755997896194458, "adv/ratio_final_to_reasoning": 1.3385774146467728, "adv/ratio_step_to_reasoning": 1.819861602856328, "adv/std_final_conf": 0.7791886925697327, "adv/std_reasoning": 0.6816006898880005, "adv/std_step_conf": 0.9320200085639954, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6219101553531798, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.2877777777777778, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8412698412698413, "calib/gap": 0.13524450906864627, "calib/mean_conf": 0.8542857142857143, "calib/mu_c": 0.893463687150838, "calib/mu_w": 0.7582191780821917, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2158730158730159, "calib/std_conf": 0.3380638818457493, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42098344693281403, "calib/step_q_c_n": 1027.0, "calib/step_q_gap": 0.06116906178200193, "calib/step_q_w": 0.3598143851508121, "calib/step_q_w_n": 431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 474.8515625, "completions/mean_terminated_length": 476.7137451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.18346666666666667, "grad_norm": 0.029070017859339714, "kl": 0.08162689208984375, "learning_rate": 7.777777777777779e-07, "loss": 0.0266, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03529661148786545, "mask/share_reasoning": 0.8278791904449463, "mask/share_step_conf": 0.13291800022125244, "num_tokens": 41304904.0, "reward": 0.9602549076080322, "reward_std": 0.18681581318378448, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.8800837993621826, "rewards/final_brier_reward_step": 0.7037070393562317, "rewards/format_reward_step": 0.984375, "step": 172 }, { "adv/mean_abs_final_conf": 0.5546171069145203, "adv/mean_abs_reasoning": 0.4558177590370178, "adv/mean_abs_step_conf": 0.7512186169624329, "adv/ratio_final_to_reasoning": 1.2167518617226116, "adv/ratio_step_to_reasoning": 1.6480679000956278, "adv/std_final_conf": 0.793694019317627, "adv/std_reasoning": 0.7205135226249695, "adv/std_step_conf": 0.9342923164367676, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6639427641394277, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.34315789473684216, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8987854251012146, "calib/gap": 0.16451105384511044, "calib/mean_conf": 0.9097165991902834, "calib/mu_c": 0.976986301369863, "calib/mu_w": 0.8124752475247525, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3308906882591094, "calib/std_conf": 0.2742769675103127, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4677382319173364, "calib/step_q_c_n": 871.0, "calib/step_q_gap": 0.05031398949309396, "calib/step_q_w": 0.4174242424242424, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 515.125, "completions/mean_terminated_length": 517.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.18453333333333333, "grad_norm": 0.03989225625991821, "kl": 0.07109832763671875, "learning_rate": 7.5e-07, "loss": 0.0486, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035245005041360855, "mask/share_reasoning": 0.8358505964279175, "mask/share_step_conf": 0.12499810010194778, "num_tokens": 41539936.0, "reward": 0.8893845081329346, "reward_std": 0.1888009011745453, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.8339041471481323, "rewards/final_brier_reward_step": 0.6378335952758789, "rewards/format_reward_step": 0.96484375, "step": 173 }, { "adv/mean_abs_final_conf": 0.6846530437469482, "adv/mean_abs_reasoning": 0.6147565841674805, "adv/mean_abs_step_conf": 0.7412719130516052, "adv/ratio_final_to_reasoning": 1.1136977811699624, "adv/ratio_step_to_reasoning": 1.2057974361599642, "adv/std_final_conf": 0.8761252164840698, "adv/std_reasoning": 0.8266335725784302, "adv/std_step_conf": 0.9345950484275818, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6985884485884486, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.3396385542168675, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6224899598393574, "calib/gap": 0.318115773115773, "calib/mean_conf": 0.6447791164658635, "calib/mu_c": 0.8134188034188033, "calib/mu_w": 0.49530303030303036, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2572690763052209, "calib/std_conf": 0.46009433663111154, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40963017751479286, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.07976270763527482, "calib/step_q_w": 0.32986746987951804, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 582.76171875, "completions/mean_terminated_length": 582.76171875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1856, "grad_norm": 0.03356742113828659, "kl": 0.06365203857421875, "learning_rate": 7.222222222222222e-07, "loss": -0.0127, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03057742491364479, "mask/share_reasoning": 0.8542848825454712, "mask/share_step_conf": 0.11513769626617432, "num_tokens": 41793355.0, "reward": 0.8853936195373535, "reward_std": 0.238206684589386, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.8359798192977905, "rewards/final_brier_reward_step": 0.6488698720932007, "rewards/format_reward_step": 0.97265625, "step": 174 }, { "adv/mean_abs_final_conf": 0.6600933074951172, "adv/mean_abs_reasoning": 0.49963003396987915, "adv/mean_abs_step_conf": 0.7545915842056274, "adv/ratio_final_to_reasoning": 1.3211641867288382, "adv/ratio_step_to_reasoning": 1.5103006883111414, "adv/std_final_conf": 0.8539295792579651, "adv/std_reasoning": 0.7394165992736816, "adv/std_step_conf": 0.9340283274650574, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6916584564860426, "calib/avg_num_step_conf": 6.4453125, "calib/ece": 0.35928, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.656, "calib/gap": 0.317983579638752, "calib/mean_conf": 0.67776, "calib/mu_c": 0.8621904761904762, "calib/mu_w": 0.5442068965517242, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30852, "calib/std_conf": 0.45016061844634964, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4147289156626506, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.12084149172755931, "calib/step_q_w": 0.29388742393509126, "calib/step_q_w_n": 986.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 553.91015625, "completions/mean_terminated_length": 558.2716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.18666666666666668, "grad_norm": 0.06960975378751755, "kl": 0.06383132934570312, "learning_rate": 6.944444444444446e-07, "loss": -0.0632, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03212447091937065, "mask/share_reasoning": 0.834095299243927, "mask/share_step_conf": 0.12596774101257324, "num_tokens": 42040980.0, "reward": 0.8779298067092896, "reward_std": 0.2234022468328476, "rewards/accuracy_reward_step": 0.4140625, "rewards/asymmetric_l2_reward": 0.8505501747131348, "rewards/final_brier_reward_step": 0.6271843910217285, "rewards/format_reward_step": 0.9765625, "step": 175 }, { "adv/mean_abs_final_conf": 0.5671244859695435, "adv/mean_abs_reasoning": 0.45925626158714294, "adv/mean_abs_step_conf": 0.7286878228187561, "adv/ratio_final_to_reasoning": 1.2348758926217334, "adv/ratio_step_to_reasoning": 1.5866693255318611, "adv/std_final_conf": 0.7785314321517944, "adv/std_reasoning": 0.7206246852874756, "adv/std_step_conf": 0.9327684640884399, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7994992295839753, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.23900000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.68, "calib/gap": 0.47578839239856185, "calib/mean_conf": 0.70164, "calib/mu_c": 0.9262121212121212, "calib/mu_w": 0.4504237288135593, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20632000000000003, "calib/std_conf": 0.4420272733667008, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4255120101137801, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.15050537000753839, "calib/step_q_w": 0.2750066401062417, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 526.4375, "completions/mean_terminated_length": 526.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.18773333333333334, "grad_norm": 0.07707049697637558, "kl": 0.0714874267578125, "learning_rate": 6.666666666666667e-07, "loss": -0.0188, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.036508336663246155, "mask/share_reasoning": 0.8322881460189819, "mask/share_step_conf": 0.13120350241661072, "num_tokens": 42279812.0, "reward": 0.9646437764167786, "reward_std": 0.1994141936302185, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.886325478553772, "rewards/final_brier_reward_step": 0.7445245981216431, "rewards/format_reward_step": 0.9765625, "step": 176 }, { "adv/mean_abs_final_conf": 0.6146119832992554, "adv/mean_abs_reasoning": 0.497945100069046, "adv/mean_abs_step_conf": 0.7429401278495789, "adv/ratio_final_to_reasoning": 1.234296678919086, "adv/ratio_step_to_reasoning": 1.4920121269323894, "adv/std_final_conf": 0.8337008953094482, "adv/std_reasoning": 0.7752436995506287, "adv/std_step_conf": 0.9334618449211121, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7455472379969024, "calib/avg_num_step_conf": 6.1015625, "calib/ece": 0.26134387351778665, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6324110671936759, "calib/gap": 0.40504710893133716, "calib/mean_conf": 0.6538339920948617, "calib/mu_c": 0.8203355704697987, "calib/mu_w": 0.4152884615384615, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16312252964426888, "calib/std_conf": 0.45910542430164514, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4397441860465117, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.13854048234280797, "calib/step_q_w": 0.30120370370370375, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 505.9140625, "completions/mean_terminated_length": 507.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.1888, "grad_norm": 0.03708864748477936, "kl": 0.08526611328125, "learning_rate": 6.388888888888889e-07, "loss": -0.016, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033189140260219574, "mask/share_reasoning": 0.8366128206253052, "mask/share_step_conf": 0.12629178166389465, "num_tokens": 42513158.0, "reward": 0.9594067335128784, "reward_std": 0.21088215708732605, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8830938935279846, "rewards/final_brier_reward_step": 0.7232195138931274, "rewards/format_reward_step": 0.98046875, "step": 177 }, { "adv/mean_abs_final_conf": 0.6202833652496338, "adv/mean_abs_reasoning": 0.4997982382774353, "adv/mean_abs_step_conf": 0.7502584457397461, "adv/ratio_final_to_reasoning": 1.2410675303447505, "adv/ratio_step_to_reasoning": 1.501122629654572, "adv/std_final_conf": 0.8151530623435974, "adv/std_reasoning": 0.7575740814208984, "adv/std_step_conf": 0.9331055879592896, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.803671928620453, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.20469879518072293, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7228915662650602, "calib/gap": 0.4577343857240904, "calib/mean_conf": 0.7482329317269076, "calib/mu_c": 0.921032258064516, "calib/mu_w": 0.46329787234042563, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1652208835341366, "calib/std_conf": 0.41368664047303105, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46093316519546024, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.19946558840365136, "calib/step_q_w": 0.2614675767918089, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 474.35546875, "completions/mean_terminated_length": 478.0905456542969, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.18986666666666666, "grad_norm": 0.04096180945634842, "kl": 0.08055877685546875, "learning_rate": 6.111111111111112e-07, "loss": 0.0305, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035738177597522736, "mask/share_reasoning": 0.8296140432357788, "mask/share_step_conf": 0.12683530151844025, "num_tokens": 42740665.0, "reward": 0.9872586727142334, "reward_std": 0.21035568416118622, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.8873934745788574, "rewards/final_brier_reward_step": 0.7714987993240356, "rewards/format_reward_step": 0.97265625, "step": 178 }, { "adv/mean_abs_final_conf": 0.5899964570999146, "adv/mean_abs_reasoning": 0.5133465528488159, "adv/mean_abs_step_conf": 0.7466897964477539, "adv/ratio_final_to_reasoning": 1.1493141501111288, "adv/ratio_step_to_reasoning": 1.454553054469734, "adv/std_final_conf": 0.7958014011383057, "adv/std_reasoning": 0.7394075989723206, "adv/std_step_conf": 0.9330047965049744, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7870469798657718, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.2223293172690764, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6947791164658634, "calib/gap": 0.44986040268456373, "calib/mean_conf": 0.7183935742971888, "calib/mu_c": 0.8990604026845638, "calib/mu_w": 0.44920000000000004, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17116465863453825, "calib/std_conf": 0.42960127880849125, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42761737089201884, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.14840945009993967, "calib/step_q_w": 0.2792079207920792, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 505.24609375, "completions/mean_terminated_length": 509.2243957519531, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.19093333333333334, "grad_norm": 0.0683615505695343, "kl": 0.069488525390625, "learning_rate": 5.833333333333334e-07, "loss": -0.0626, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034047625958919525, "mask/share_reasoning": 0.83702552318573, "mask/share_step_conf": 0.1211143285036087, "num_tokens": 42976272.0, "reward": 0.9702746272087097, "reward_std": 0.20280179381370544, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.878600001335144, "rewards/final_brier_reward_step": 0.7517929673194885, "rewards/format_reward_step": 0.96875, "step": 179 }, { "adv/mean_abs_final_conf": 0.5583059787750244, "adv/mean_abs_reasoning": 0.3551320731639862, "adv/mean_abs_step_conf": 0.7670993804931641, "adv/ratio_final_to_reasoning": 1.5721080154796956, "adv/ratio_step_to_reasoning": 2.160039710462725, "adv/std_final_conf": 0.7800789475440979, "adv/std_reasoning": 0.6403230428695679, "adv/std_step_conf": 0.9319993853569031, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.752177759629781, "calib/avg_num_step_conf": 6.31640625, "calib/ece": 0.2102788844621513, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7051792828685259, "calib/gap": 0.463054988430652, "calib/mean_conf": 0.7283665338645419, "calib/mu_c": 0.8999367088607595, "calib/mu_w": 0.4368817204301075, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1545816733067728, "calib/std_conf": 0.42847747819257054, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4166522210184182, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.1318827685688505, "calib/step_q_w": 0.2847694524495677, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 572.82421875, "completions/mean_terminated_length": 577.3346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.192, "grad_norm": 0.04807959124445915, "kl": 0.065093994140625, "learning_rate": 5.555555555555555e-07, "loss": 0.001, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0307810977101326, "mask/share_reasoning": 0.8431800603866577, "mask/share_step_conf": 0.1182263046503067, "num_tokens": 43226771.0, "reward": 0.9837304353713989, "reward_std": 0.17399966716766357, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8786393404006958, "rewards/final_brier_reward_step": 0.7700715065002441, "rewards/format_reward_step": 0.9765625, "step": 180 }, { "adv/mean_abs_final_conf": 0.5820561647415161, "adv/mean_abs_reasoning": 0.41340193152427673, "adv/mean_abs_step_conf": 0.7540974617004395, "adv/ratio_final_to_reasoning": 1.4079667276719903, "adv/ratio_step_to_reasoning": 1.824126604633814, "adv/std_final_conf": 0.8107714653015137, "adv/std_reasoning": 0.6816644072532654, "adv/std_step_conf": 0.933193027973175, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7967366557045283, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.27083333333333326, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.4364672400708322, "calib/mean_conf": 0.6251984126984127, "calib/mu_c": 0.8572881355932203, "calib/mu_w": 0.42082089552238805, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2138888888888888, "calib/std_conf": 0.46455683814846205, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42562974203338394, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.15130175189774892, "calib/step_q_w": 0.274327990135635, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 470.2734375, "completions/mean_terminated_length": 473.97637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.19306666666666666, "grad_norm": 0.045435406267642975, "kl": 0.08133697509765625, "learning_rate": 5.277777777777779e-07, "loss": -0.0131, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03530710190534592, "mask/share_reasoning": 0.8284176588058472, "mask/share_step_conf": 0.12846270203590393, "num_tokens": 43453425.0, "reward": 0.9462225437164307, "reward_std": 0.16522014141082764, "rewards/accuracy_reward_step": 0.4609375, "rewards/asymmetric_l2_reward": 0.8868392705917358, "rewards/final_brier_reward_step": 0.7165433168411255, "rewards/format_reward_step": 0.984375, "step": 181 }, { "adv/mean_abs_final_conf": 0.5667697191238403, "adv/mean_abs_reasoning": 0.4072721302509308, "adv/mean_abs_step_conf": 0.7169884443283081, "adv/ratio_final_to_reasoning": 1.3916241181900637, "adv/ratio_step_to_reasoning": 1.7604652787966444, "adv/std_final_conf": 0.7674015164375305, "adv/std_reasoning": 0.6816434860229492, "adv/std_step_conf": 0.9322676658630371, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7537513208876364, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.2442857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7420634920634921, "calib/gap": 0.35981683691440625, "calib/mean_conf": 0.764920634920635, "calib/mu_c": 0.8862874251497005, "calib/mu_w": 0.5264705882352942, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17325396825396824, "calib/std_conf": 0.4069012711885218, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4113034623217922, "calib/step_q_c_n": 982.0, "calib/step_q_gap": 0.12497219297194695, "calib/step_q_w": 0.28633126934984526, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 526.54296875, "completions/mean_terminated_length": 530.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.19413333333333332, "grad_norm": 0.031100839376449585, "kl": 0.06735992431640625, "learning_rate": 5.000000000000001e-07, "loss": -0.0536, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031918980181217194, "mask/share_reasoning": 0.8300646543502808, "mask/share_step_conf": 0.13020385801792145, "num_tokens": 43694380.0, "reward": 0.9876556396484375, "reward_std": 0.1890118420124054, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.8993324041366577, "rewards/final_brier_reward_step": 0.7494163513183594, "rewards/format_reward_step": 0.98046875, "step": 182 }, { "adv/mean_abs_final_conf": 0.6406201720237732, "adv/mean_abs_reasoning": 0.4643709659576416, "adv/mean_abs_step_conf": 0.7473157644271851, "adv/ratio_final_to_reasoning": 1.37954398312277, "adv/ratio_step_to_reasoning": 1.6093076854752215, "adv/std_final_conf": 0.844761073589325, "adv/std_reasoning": 0.7206538319587708, "adv/std_step_conf": 0.9331881999969482, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.707851110416015, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.3112598425196852, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6181102362204725, "calib/gap": 0.3128683140444166, "calib/mean_conf": 0.6425196850393701, "calib/mu_c": 0.7841726618705036, "calib/mu_w": 0.471304347826087, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20326771653543318, "calib/std_conf": 0.46144068898940727, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40705179282868525, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.12211114000969414, "calib/step_q_w": 0.2849406528189911, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 522.02734375, "completions/mean_terminated_length": 522.02734375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1952, "grad_norm": 0.035230036824941635, "kl": 0.067352294921875, "learning_rate": 4.7222222222222226e-07, "loss": -0.1027, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03272725269198418, "mask/share_reasoning": 0.8501238226890564, "mask/share_step_conf": 0.11714892089366913, "num_tokens": 43934699.0, "reward": 0.9388773441314697, "reward_std": 0.19883155822753906, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8908123970031738, "rewards/final_brier_reward_step": 0.6799108982086182, "rewards/format_reward_step": 0.9921875, "step": 183 }, { "adv/mean_abs_final_conf": 0.599378764629364, "adv/mean_abs_reasoning": 0.5096433162689209, "adv/mean_abs_step_conf": 0.7571796178817749, "adv/ratio_final_to_reasoning": 1.1760750028419735, "adv/ratio_step_to_reasoning": 1.4857049895700738, "adv/std_final_conf": 0.8223716020584106, "adv/std_reasoning": 0.757595956325531, "adv/std_step_conf": 0.9336126446723938, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7833777481678881, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.18841897233201577, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.691699604743083, "calib/gap": 0.5367495003331113, "calib/mean_conf": 0.7044664031620554, "calib/mu_c": 0.9060126582278482, "calib/mu_w": 0.36926315789473685, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13418972332015805, "calib/std_conf": 0.44528026864656384, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4251701427003294, "calib/step_q_c_n": 911.0, "calib/step_q_gap": 0.13512523252068864, "calib/step_q_w": 0.29004491017964074, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 522.33203125, "completions/mean_terminated_length": 522.33203125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.19626666666666667, "grad_norm": 0.03629198670387268, "kl": 0.06891632080078125, "learning_rate": 4.444444444444445e-07, "loss": 0.0815, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032990530133247375, "mask/share_reasoning": 0.843195378780365, "mask/share_step_conf": 0.12381406873464584, "num_tokens": 44173696.0, "reward": 1.0000405311584473, "reward_std": 0.20342886447906494, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.8806300759315491, "rewards/final_brier_reward_step": 0.7991386651992798, "rewards/format_reward_step": 0.984375, "step": 184 }, { "adv/mean_abs_final_conf": 0.5196036696434021, "adv/mean_abs_reasoning": 0.3922334909439087, "adv/mean_abs_step_conf": 0.7511488199234009, "adv/ratio_final_to_reasoning": 1.3247305027242255, "adv/ratio_step_to_reasoning": 1.9150552853499674, "adv/std_final_conf": 0.7578298449516296, "adv/std_reasoning": 0.6816517114639282, "adv/std_step_conf": 0.9322458505630493, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.790268456375839, "calib/avg_num_step_conf": 6.34765625, "calib/ece": 0.18509960159362543, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6573705179282868, "calib/gap": 0.5555230951440977, "calib/mean_conf": 0.6780079681274901, "calib/mu_c": 0.9037583892617448, "calib/mu_w": 0.3482352941176471, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1347410358565736, "calib/std_conf": 0.45274111572155207, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4383154121863799, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.1932646507650601, "calib/step_q_w": 0.24505076142131982, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 518.94921875, "completions/mean_terminated_length": 520.984375, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.19733333333333333, "grad_norm": 0.04340367391705513, "kl": 0.06716156005859375, "learning_rate": 4.1666666666666667e-07, "loss": -0.041, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03309101611375809, "mask/share_reasoning": 0.8382681012153625, "mask/share_step_conf": 0.12473461031913757, "num_tokens": 44413467.0, "reward": 0.9960746765136719, "reward_std": 0.16638922691345215, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8855493068695068, "rewards/final_brier_reward_step": 0.7948812246322632, "rewards/format_reward_step": 0.9765625, "step": 185 }, { "adv/mean_abs_final_conf": 0.5315274000167847, "adv/mean_abs_reasoning": 0.4281071424484253, "adv/mean_abs_step_conf": 0.7454137206077576, "adv/ratio_final_to_reasoning": 1.2415756415015162, "adv/ratio_step_to_reasoning": 1.7411849667926498, "adv/std_final_conf": 0.7777411937713623, "adv/std_reasoning": 0.7204946279525757, "adv/std_step_conf": 0.9312522411346436, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.765589455372675, "calib/avg_num_step_conf": 6.1953125, "calib/ece": 0.22940711462450591, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.47976916900843036, "calib/mean_conf": 0.6343873517786561, "calib/mu_c": 0.8126415094339623, "calib/mu_w": 0.33287234042553193, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11766798418972331, "calib/std_conf": 0.46896248296948856, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41842163355408385, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.15374516296584861, "calib/step_q_w": 0.26467647058823524, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 516.51171875, "completions/mean_terminated_length": 520.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.1984, "grad_norm": 0.031362757086753845, "kl": 0.07161712646484375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0144, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03271438926458359, "mask/share_reasoning": 0.8355770707130432, "mask/share_step_conf": 0.12389606237411499, "num_tokens": 44650734.0, "reward": 0.9942600131034851, "reward_std": 0.15620407462120056, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.9050840139389038, "rewards/final_brier_reward_step": 0.7615609169006348, "rewards/format_reward_step": 0.98828125, "step": 186 }, { "adv/mean_abs_final_conf": 0.7074639201164246, "adv/mean_abs_reasoning": 0.5609918832778931, "adv/mean_abs_step_conf": 0.7377896308898926, "adv/ratio_final_to_reasoning": 1.2610947523566487, "adv/ratio_step_to_reasoning": 1.3151520599174533, "adv/std_final_conf": 0.878374457359314, "adv/std_reasoning": 0.7928860783576965, "adv/std_step_conf": 0.9338283538818359, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6919585826665771, "calib/avg_num_step_conf": 6.58203125, "calib/ece": 0.326178861788618, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6260162601626016, "calib/gap": 0.28398171182680026, "calib/mean_conf": 0.655040650406504, "calib/mu_c": 0.7785611510791367, "calib/mu_w": 0.49457943925233644, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.20808943089430904, "calib/std_conf": 0.4556517247662998, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.3989940119760479, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.13965283550545965, "calib/step_q_w": 0.25934117647058824, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 573.05078125, "completions/mean_terminated_length": 573.05078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.19946666666666665, "grad_norm": 0.047245342284440994, "kl": 0.06989288330078125, "learning_rate": 3.611111111111111e-07, "loss": 0.0154, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03235046565532684, "mask/share_reasoning": 0.8461363315582275, "mask/share_step_conf": 0.1215132549405098, "num_tokens": 44898979.0, "reward": 0.8949373960494995, "reward_std": 0.23922453820705414, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.8468563556671143, "rewards/final_brier_reward_step": 0.6437996029853821, "rewards/format_reward_step": 0.953125, "step": 187 }, { "adv/mean_abs_final_conf": 0.5643256902694702, "adv/mean_abs_reasoning": 0.47549188137054443, "adv/mean_abs_step_conf": 0.7558174729347229, "adv/ratio_final_to_reasoning": 1.186825080257677, "adv/ratio_step_to_reasoning": 1.5895486390980553, "adv/std_final_conf": 0.7799937129020691, "adv/std_reasoning": 0.7574599981307983, "adv/std_step_conf": 0.9330994486808777, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6968869290509412, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.2914859437751004, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6987951807228916, "calib/gap": 0.32033960773989734, "calib/mean_conf": 0.7150200803212851, "calib/mu_c": 0.8526760563380282, "calib/mu_w": 0.5323364485981309, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21811244979919678, "calib/std_conf": 0.4348300914519625, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43332065906210393, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.10563164337023517, "calib/step_q_w": 0.32768901569186876, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 534.078125, "completions/mean_terminated_length": 536.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.20053333333333334, "grad_norm": 0.038735099136829376, "kl": 0.07011795043945312, "learning_rate": 3.3333333333333335e-07, "loss": 0.0616, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033644575625658035, "mask/share_reasoning": 0.8366734981536865, "mask/share_step_conf": 0.12577570974826813, "num_tokens": 45139775.0, "reward": 0.9236536026000977, "reward_std": 0.19174334406852722, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8591135144233704, "rewards/final_brier_reward_step": 0.6827249526977539, "rewards/format_reward_step": 0.97265625, "step": 188 }, { "adv/mean_abs_final_conf": 0.6418702602386475, "adv/mean_abs_reasoning": 0.47118186950683594, "adv/mean_abs_step_conf": 0.7607347965240479, "adv/ratio_final_to_reasoning": 1.3622558544335823, "adv/ratio_step_to_reasoning": 1.6145247636974518, "adv/std_final_conf": 0.8555505275726318, "adv/std_reasoning": 0.7392024993896484, "adv/std_step_conf": 0.9324488639831543, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7302891744933266, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.2803529411764706, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5647058823529412, "calib/gap": 0.35672268907563026, "calib/mean_conf": 0.6170588235294119, "calib/mu_c": 0.7835294117647059, "calib/mu_w": 0.42680672268907566, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18203921568627449, "calib/std_conf": 0.46179655191796504, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.38990264255910995, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.1266883568448242, "calib/step_q_w": 0.26321428571428573, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 488.4453125, "completions/mean_terminated_length": 490.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.2016, "grad_norm": 0.04739164561033249, "kl": 0.07581329345703125, "learning_rate": 3.055555555555556e-07, "loss": -0.0497, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0347851887345314, "mask/share_reasoning": 0.8378597497940063, "mask/share_step_conf": 0.12344881892204285, "num_tokens": 45372585.0, "reward": 0.9458571672439575, "reward_std": 0.20587018132209778, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.8852865099906921, "rewards/final_brier_reward_step": 0.7017402648925781, "rewards/format_reward_step": 0.9921875, "step": 189 }, { "adv/mean_abs_final_conf": 0.6198446750640869, "adv/mean_abs_reasoning": 0.4780905246734619, "adv/mean_abs_step_conf": 0.7499421834945679, "adv/ratio_final_to_reasoning": 1.2965006480466095, "adv/ratio_step_to_reasoning": 1.568619633293887, "adv/std_final_conf": 0.8346047401428223, "adv/std_reasoning": 0.7392293810844421, "adv/std_step_conf": 0.9328544735908508, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7546182266009852, "calib/avg_num_step_conf": 5.98046875, "calib/ece": 0.25703124999999993, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6328125, "calib/gap": 0.4335714285714287, "calib/mean_conf": 0.6471093749999999, "calib/mu_c": 0.8435714285714287, "calib/mu_w": 0.41000000000000003, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17863281249999996, "calib/std_conf": 0.4635092912090429, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39644417475728155, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.11338901209815849, "calib/step_q_w": 0.28305516265912306, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 529.01953125, "completions/mean_terminated_length": 531.0941772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.20266666666666666, "grad_norm": 0.028230194002389908, "kl": 0.06874847412109375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0593, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0311330147087574, "mask/share_reasoning": 0.8457903265953064, "mask/share_step_conf": 0.1191704124212265, "num_tokens": 45613622.0, "reward": 0.9775964021682739, "reward_std": 0.17072449624538422, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.9036279320716858, "rewards/final_brier_reward_step": 0.742189884185791, "rewards/format_reward_step": 1.0, "step": 190 }, { "adv/mean_abs_final_conf": 0.614532470703125, "adv/mean_abs_reasoning": 0.43729501962661743, "adv/mean_abs_step_conf": 0.7394310235977173, "adv/ratio_final_to_reasoning": 1.405304069613784, "adv/ratio_step_to_reasoning": 1.6909202950197728, "adv/std_final_conf": 0.8193501830101013, "adv/std_reasoning": 0.7205649018287659, "adv/std_step_conf": 0.9335800409317017, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6994230528032108, "calib/avg_num_step_conf": 6.80078125, "calib/ece": 0.3281818181818182, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.30981500062711653, "calib/mean_conf": 0.6575889328063241, "calib/mu_c": 0.8216806722689076, "calib/mu_w": 0.5118656716417911, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25770750988142294, "calib/std_conf": 0.45239821753549814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42917261055634803, "calib/step_q_c_n": 701.0, "calib/step_q_gap": 0.12468222594096345, "calib/step_q_w": 0.3044903846153846, "calib/step_q_w_n": 1040.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 520.515625, "completions/mean_terminated_length": 520.515625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.20373333333333332, "grad_norm": 0.04467106983065605, "kl": 0.07276153564453125, "learning_rate": 2.5000000000000004e-07, "loss": 0.0413, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03575979173183441, "mask/share_reasoning": 0.82267165184021, "mask/share_step_conf": 0.14156854152679443, "num_tokens": 45851042.0, "reward": 0.9081317186355591, "reward_std": 0.185234934091568, "rewards/accuracy_reward_step": 0.46484375, "rewards/asymmetric_l2_reward": 0.8679160475730896, "rewards/final_brier_reward_step": 0.6577222943305969, "rewards/format_reward_step": 0.98828125, "step": 191 }, { "adv/mean_abs_final_conf": 0.6072635650634766, "adv/mean_abs_reasoning": 0.47368013858795166, "adv/mean_abs_step_conf": 0.7371071577072144, "adv/ratio_final_to_reasoning": 1.2820118801555398, "adv/ratio_step_to_reasoning": 1.5561284876848394, "adv/std_final_conf": 0.7935887575149536, "adv/std_reasoning": 0.7393070459365845, "adv/std_step_conf": 0.9304554462432861, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.797986798679868, "calib/avg_num_step_conf": 5.640625, "calib/ece": 0.19027888446215144, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5976095617529881, "calib/gap": 0.5394666666666668, "calib/mean_conf": 0.6223904382470119, "calib/mu_c": 0.8394666666666667, "calib/mu_w": 0.3, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10752988047808769, "calib/std_conf": 0.4627484397319022, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40863046044864226, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.12956915391597895, "calib/step_q_w": 0.2790613065326633, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 535.84765625, "completions/mean_terminated_length": 537.9490356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.2048, "grad_norm": 0.047735992819070816, "kl": 0.07106781005859375, "learning_rate": 2.2222222222222224e-07, "loss": 0.0159, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03571078181266785, "mask/share_reasoning": 0.8375634551048279, "mask/share_step_conf": 0.12281954288482666, "num_tokens": 46093195.0, "reward": 0.9984359741210938, "reward_std": 0.19008705019950867, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.8958485126495361, "rewards/final_brier_reward_step": 0.7885234355926514, "rewards/format_reward_step": 0.9765625, "step": 192 }, { "adv/mean_abs_final_conf": 0.663608729839325, "adv/mean_abs_reasoning": 0.5208760499954224, "adv/mean_abs_step_conf": 0.7138371467590332, "adv/ratio_final_to_reasoning": 1.274024270928097, "adv/ratio_step_to_reasoning": 1.3704549225584601, "adv/std_final_conf": 0.8608205318450928, "adv/std_reasoning": 0.7394456267356873, "adv/std_step_conf": 0.9336157441139221, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7956674862117085, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.22696000000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.592, "calib/gap": 0.4576955279420558, "calib/mean_conf": 0.6274400000000001, "calib/mu_c": 0.8123489932885906, "calib/mu_w": 0.35465346534653475, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12920000000000015, "calib/std_conf": 0.45731547797991706, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4241108247422681, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.1588341715708781, "calib/step_q_w": 0.26527665317139, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 552.3515625, "completions/mean_terminated_length": 552.3515625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.20586666666666667, "grad_norm": 0.04746817424893379, "kl": 0.068115234375, "learning_rate": 1.9444444444444447e-07, "loss": -0.004, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031759485602378845, "mask/share_reasoning": 0.846390426158905, "mask/share_step_conf": 0.12185005843639374, "num_tokens": 46340309.0, "reward": 0.9673618674278259, "reward_std": 0.2169458270072937, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.8808885812759399, "rewards/final_brier_reward_step": 0.7436789274215698, "rewards/format_reward_step": 0.96875, "step": 193 }, { "adv/mean_abs_final_conf": 0.5674813985824585, "adv/mean_abs_reasoning": 0.3744353652000427, "adv/mean_abs_step_conf": 0.7580137252807617, "adv/ratio_final_to_reasoning": 1.5155657059243872, "adv/ratio_step_to_reasoning": 2.0244180858178065, "adv/std_final_conf": 0.8004591464996338, "adv/std_reasoning": 0.6403605937957764, "adv/std_step_conf": 0.933774471282959, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8198921359588611, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.21711462450592878, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6205533596837944, "calib/gap": 0.5331562774363476, "calib/mean_conf": 0.6388537549407114, "calib/mu_c": 0.8896268656716418, "calib/mu_w": 0.35647058823529415, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1631620553359683, "calib/std_conf": 0.4687978385135658, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44545193687230994, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.17214248128491738, "calib/step_q_w": 0.27330945558739256, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 478.26171875, "completions/mean_terminated_length": 478.26171875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.20693333333333333, "grad_norm": 0.03423633426427841, "kl": 0.06972503662109375, "learning_rate": 1.6666666666666668e-07, "loss": -0.0187, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03547438606619835, "mask/share_reasoning": 0.8433297276496887, "mask/share_step_conf": 0.12119589745998383, "num_tokens": 46568688.0, "reward": 0.9822894334793091, "reward_std": 0.18843895196914673, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.8912980556488037, "rewards/final_brier_reward_step": 0.7717183232307434, "rewards/format_reward_step": 0.984375, "step": 194 }, { "adv/mean_abs_final_conf": 0.5949089527130127, "adv/mean_abs_reasoning": 0.3306322693824768, "adv/mean_abs_step_conf": 0.7501680850982666, "adv/ratio_final_to_reasoning": 1.799306987863364, "adv/ratio_step_to_reasoning": 2.268889502223599, "adv/std_final_conf": 0.8255235552787781, "adv/std_reasoning": 0.6185620427131653, "adv/std_step_conf": 0.9332430362701416, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7805431330611187, "calib/avg_num_step_conf": 5.890625, "calib/ece": 0.20566800000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.592, "calib/gap": 0.5285356795644565, "calib/mean_conf": 0.623028, "calib/mu_c": 0.8576978417266186, "calib/mu_w": 0.32916216216216215, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13634800000000002, "calib/std_conf": 0.4616225159326611, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4317737789203085, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.16059569672852764, "calib/step_q_w": 0.27117808219178086, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 512.59765625, "completions/mean_terminated_length": 516.6338500976562, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.208, "grad_norm": 0.04133572801947594, "kl": 0.06982421875, "learning_rate": 1.3888888888888888e-07, "loss": -0.0151, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03313131630420685, "mask/share_reasoning": 0.8336950540542603, "mask/share_step_conf": 0.1253610998392105, "num_tokens": 46805897.0, "reward": 0.985278844833374, "reward_std": 0.18139265477657318, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.888818621635437, "rewards/final_brier_reward_step": 0.7778328061103821, "rewards/format_reward_step": 0.9765625, "step": 195 }, { "adv/mean_abs_final_conf": 0.5143544673919678, "adv/mean_abs_reasoning": 0.3996366560459137, "adv/mean_abs_step_conf": 0.7586710453033447, "adv/ratio_final_to_reasoning": 1.2870552778643867, "adv/ratio_step_to_reasoning": 1.8984020455225261, "adv/std_final_conf": 0.7629029750823975, "adv/std_reasoning": 0.6816370487213135, "adv/std_step_conf": 0.9331806898117065, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7817218627077782, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.2220553359683795, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6956521739130435, "calib/gap": 0.4824356046187033, "calib/mean_conf": 0.7189723320158102, "calib/mu_c": 0.9306338028169014, "calib/mu_w": 0.44819819819819817, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18988142292490126, "calib/std_conf": 0.43153981494492955, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4720873124147339, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.14921429654171803, "calib/step_q_w": 0.32287301587301587, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 425.453125, "completions/mean_terminated_length": 428.80316162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.20906666666666668, "grad_norm": 0.03610050305724144, "kl": 0.08242034912109375, "learning_rate": 1.1111111111111112e-07, "loss": -0.0574, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037843845784664154, "mask/share_reasoning": 0.8252567052841187, "mask/share_step_conf": 0.12908688187599182, "num_tokens": 47017357.0, "reward": 0.9776846170425415, "reward_std": 0.17640987038612366, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.8756676912307739, "rewards/final_brier_reward_step": 0.7711077928543091, "rewards/format_reward_step": 0.98828125, "step": 196 }, { "adv/mean_abs_final_conf": 0.5709211826324463, "adv/mean_abs_reasoning": 0.5281293988227844, "adv/mean_abs_step_conf": 0.7545597553253174, "adv/ratio_final_to_reasoning": 1.081025187965385, "adv/ratio_step_to_reasoning": 1.4287402992661509, "adv/std_final_conf": 0.7653646469116211, "adv/std_reasoning": 0.7754925489425659, "adv/std_step_conf": 0.932712972164154, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8252395752395751, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.2161441767068273, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6144578313253012, "calib/gap": 0.5121394522144523, "calib/mean_conf": 0.6553417670682731, "calib/mu_c": 0.8959856060606062, "calib/mu_w": 0.38384615384615384, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1706827309236948, "calib/std_conf": 0.4506214670081801, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43755376344086017, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.16742600502041882, "calib/step_q_w": 0.27012775842044134, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 550.2890625, "completions/mean_terminated_length": 550.2890625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.21013333333333334, "grad_norm": 0.04556097462773323, "kl": 0.06296539306640625, "learning_rate": 8.333333333333334e-08, "loss": 0.0593, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03290029242634773, "mask/share_reasoning": 0.8371220231056213, "mask/share_step_conf": 0.12997770309448242, "num_tokens": 47263287.0, "reward": 0.9729477167129517, "reward_std": 0.1934598684310913, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.8871469497680664, "rewards/final_brier_reward_step": 0.7618734240531921, "rewards/format_reward_step": 0.96875, "step": 197 }, { "adv/mean_abs_final_conf": 0.6123339533805847, "adv/mean_abs_reasoning": 0.5126949548721313, "adv/mean_abs_step_conf": 0.7387211918830872, "adv/ratio_final_to_reasoning": 1.1943436297969887, "adv/ratio_step_to_reasoning": 1.440859101231702, "adv/std_final_conf": 0.8177485466003418, "adv/std_reasoning": 0.7576181292533875, "adv/std_step_conf": 0.931955099105835, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8303197064989518, "calib/avg_num_step_conf": 5.9921875, "calib/ece": 0.16856000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.524, "calib/gap": 0.6000628930817611, "calib/mean_conf": 0.5522400000000001, "calib/mu_c": 0.8066666666666668, "calib/mu_w": 0.20660377358490564, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07240000000000002, "calib/std_conf": 0.47586529858774107, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41482986111111114, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.10320299543946931, "calib/step_q_w": 0.31162686567164183, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 475.43359375, "completions/mean_terminated_length": 481.0711669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.2112, "grad_norm": 0.03035557083785534, "kl": 0.07807159423828125, "learning_rate": 5.555555555555556e-08, "loss": 0.0208, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03742978721857071, "mask/share_reasoning": 0.8121263384819031, "mask/share_step_conf": 0.13872510194778442, "num_tokens": 47490382.0, "reward": 1.0016413927078247, "reward_std": 0.15997019410133362, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.8928694128990173, "rewards/final_brier_reward_step": 0.8026007413864136, "rewards/format_reward_step": 0.9765625, "step": 198 }, { "adv/mean_abs_final_conf": 0.6816428899765015, "adv/mean_abs_reasoning": 0.537695050239563, "adv/mean_abs_step_conf": 0.7122054100036621, "adv/ratio_final_to_reasoning": 1.267712785663183, "adv/ratio_step_to_reasoning": 1.3245526617482313, "adv/std_final_conf": 0.8624335527420044, "adv/std_reasoning": 0.7929316163063049, "adv/std_step_conf": 0.9336603283882141, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7438086548488009, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.27726907630522085, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.642570281124498, "calib/gap": 0.3785968456725758, "calib/mean_conf": 0.6783935742971888, "calib/mu_c": 0.8486861313868614, "calib/mu_w": 0.47008928571428565, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2027309236947791, "calib/std_conf": 0.44654118043262725, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.425260347129506, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.12844216531132419, "calib/step_q_w": 0.2968181818181818, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 547.49609375, "completions/mean_terminated_length": 551.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.21226666666666666, "grad_norm": 0.29570773243904114, "kl": 1.4525260925292969, "learning_rate": 2.777777777777778e-08, "loss": -0.0941, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0359857976436615, "mask/share_reasoning": 0.8317693471908569, "mask/share_step_conf": 0.12443234026432037, "num_tokens": 47734741.0, "reward": 0.940308690071106, "reward_std": 0.24737019836902618, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.877549409866333, "rewards/final_brier_reward_step": 0.7022866606712341, "rewards/format_reward_step": 0.96875, "step": 199 }, { "adv/mean_abs_final_conf": 0.5579714775085449, "adv/mean_abs_reasoning": 0.4412845969200134, "adv/mean_abs_step_conf": 0.7684429883956909, "adv/ratio_final_to_reasoning": 1.2644254555970418, "adv/ratio_step_to_reasoning": 1.7413773192155577, "adv/std_final_conf": 0.7886030077934265, "adv/std_reasoning": 0.7204948663711548, "adv/std_step_conf": 0.9336580634117126, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7859723058398554, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.2114399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.64, "calib/gap": 0.52675630476955, "calib/mean_conf": 0.6500800000000001, "calib/mu_c": 0.8586754966887419, "calib/mu_w": 0.3319191919191919, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12875999999999993, "calib/std_conf": 0.46648128965693786, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4678467635402906, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.20974045124793178, "calib/step_q_w": 0.2581063122923588, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 496.10546875, "completions/mean_terminated_length": 500.0118103027344, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.21333333333333335, "grad_norm": 0.029257260262966156, "kl": 0.0724029541015625, "learning_rate": 0.0, "loss": -0.0295, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03522798418998718, "mask/share_reasoning": 0.8422371745109558, "mask/share_step_conf": 0.1147223487496376, "num_tokens": 47969792.0, "reward": 0.9783412218093872, "reward_std": 0.192615807056427, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.8736051321029663, "rewards/final_brier_reward_step": 0.7705773711204529, "rewards/format_reward_step": 0.97265625, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.003022296619601548, "train_runtime": 14229.7566, "train_samples_per_second": 3.598, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47969792, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }