{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7557821869850159, "adv/mean_abs_reasoning": 0.28040462732315063, "adv/mean_abs_step_conf": 0.7646071910858154, "adv/ratio_final_to_reasoning": 2.69532708571895, "adv/ratio_step_to_reasoning": 2.7267994768312023, "adv/std_final_conf": 0.9257818460464478, "adv/std_reasoning": 0.5727222561836243, "adv/std_step_conf": 0.9352434873580933, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.59765625, "calib/ece": 0.23243902439024394, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.008130081300813009, "calib/gap": -0.04614489795918364, "calib/mean_conf": 0.6646341463414636, "calib/mu_c": 0.6552551020408164, "calib/mu_w": 0.7014, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05016260162601624, "calib/std_conf": 0.05917169015101882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583372, "calib/step_q_c_n": 2500.0, "calib/step_q_gap": -0.0778082748585287, "calib/step_q_w": 0.6611802748585287, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 750.2265625, "completions/mean_terminated_length": 780.7235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0010666666666666667, "grad_norm": 0.3157978057861328, "kl": 0.00047022104263305664, "learning_rate": 0.0, "loss": -0.094, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01929234340786934, "mask/share_reasoning": 0.7498296499252319, "mask/share_step_conf": 0.19181546568870544, "num_tokens": 299642.0, "reward": 0.7782012224197388, "reward_std": 0.1634182333946228, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7708241939544678, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.4402656555175781, "step": 1 }, { "adv/mean_abs_final_conf": 0.7929245233535767, "adv/mean_abs_reasoning": 0.4050842523574829, "adv/mean_abs_step_conf": 0.768415093421936, "adv/ratio_final_to_reasoning": 1.9574311238685933, "adv/ratio_step_to_reasoning": 1.8969265997134275, "adv/std_final_conf": 0.9301473498344421, "adv/std_reasoning": 0.6612725853919983, "adv/std_step_conf": 0.9350779056549072, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.078125, "calib/ece": 0.04704724409448811, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.008169981916817282, "calib/mean_conf": 0.6691732283464566, "calib/mu_c": 0.6717142857142857, "calib/mu_w": 0.6635443037974684, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013622047244094477, "calib/std_conf": 0.060200661111313364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5911686697057605, "calib/step_q_c_n": 2413.0, "calib/step_q_gap": -0.011375410898773475, "calib/step_q_w": 0.602544080604534, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 867.8828125, "completions/mean_terminated_length": 871.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.0021333333333333334, "grad_norm": 0.5620657801628113, "kl": 0.0006206929683685303, "learning_rate": 2.5000000000000004e-07, "loss": -0.047, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01878987066447735, "mask/share_reasoning": 0.7856365442276001, "mask/share_step_conf": 0.1916673481464386, "num_tokens": 625108.0, "reward": 0.7644762992858887, "reward_std": 0.18184059858322144, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7790628671646118, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.4147334694862366, "step": 2 }, { "adv/mean_abs_final_conf": 0.7465832233428955, "adv/mean_abs_reasoning": 0.32673323154449463, "adv/mean_abs_step_conf": 0.7574703693389893, "adv/ratio_final_to_reasoning": 2.2849932338187204, "adv/ratio_step_to_reasoning": 2.3183144419022366, "adv/std_final_conf": 0.9279414415359497, "adv/std_reasoning": 0.6184476613998413, "adv/std_step_conf": 0.9352378845214844, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.43359375, "calib/ece": 0.17269531249999992, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.02415453527435607, "calib/mean_conf": 0.6649609375000001, "calib/mu_c": 0.6605263157894737, "calib/mu_w": 0.6846808510638298, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010625000000000002, "calib/std_conf": 0.056692688894786895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5873742196107234, "calib/step_q_c_n": 2723.0, "calib/step_q_gap": -0.01493024966301959, "calib/step_q_w": 0.602304469273743, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 795.03125, "completions/mean_terminated_length": 801.2913208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0032, "grad_norm": 0.33921700716018677, "kl": 0.0004811286926269531, "learning_rate": 5.000000000000001e-07, "loss": -0.0258, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019342608749866486, "mask/share_reasoning": 0.77967369556427, "mask/share_step_conf": 0.1931712031364441, "num_tokens": 933892.0, "reward": 0.8321274518966675, "reward_std": 0.2148733288049698, "rewards/accuracy_reward_step": 0.81640625, "rewards/final_brier_reward_step": 0.8167222738265991, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.4842514991760254, "step": 3 }, { "adv/mean_abs_final_conf": 0.7097917199134827, "adv/mean_abs_reasoning": 0.24592944979667664, "adv/mean_abs_step_conf": 0.7488648891448975, "adv/ratio_final_to_reasoning": 2.8861599149687294, "adv/ratio_step_to_reasoning": 3.045039501222912, "adv/std_final_conf": 0.9128405451774597, "adv/std_reasoning": 0.5481777787208557, "adv/std_step_conf": 0.9354363083839417, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.29296875, "calib/ece": 0.11666666666666665, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0196078431372549, "calib/gap": -0.02782347282347286, "calib/mean_conf": 0.6761960784313725, "calib/mu_c": 0.668994708994709, "calib/mu_w": 0.6968181818181819, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025843137254901963, "calib/std_conf": 0.06795067980956544, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5909930178432894, "calib/step_q_c_n": 2578.0, "calib/step_q_gap": -0.021513920177061996, "calib/step_q_w": 0.6125069380203514, "calib/step_q_w_n": 1081.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 868.8359375, "completions/mean_terminated_length": 875.6771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.004266666666666667, "grad_norm": 0.2606043219566345, "kl": 0.0006146430969238281, "learning_rate": 7.5e-07, "loss": 0.0218, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018096409738063812, "mask/share_reasoning": 0.7837621569633484, "mask/share_step_conf": 0.190328910946846, "num_tokens": 1262482.0, "reward": 0.7609648704528809, "reward_std": 0.15543901920318604, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7855706810951233, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.38948407769203186, "step": 4 }, { "adv/mean_abs_final_conf": 0.7735724449157715, "adv/mean_abs_reasoning": 0.2758006453514099, "adv/mean_abs_step_conf": 0.771937906742096, "adv/ratio_final_to_reasoning": 2.804824636759382, "adv/ratio_step_to_reasoning": 2.7988981162771953, "adv/std_final_conf": 0.9277228713035583, "adv/std_reasoning": 0.5482957363128662, "adv/std_step_conf": 0.9349169731140137, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.0859375, "calib/ece": 0.10850980392156867, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.023529411764705882, "calib/gap": -0.046050398885652766, "calib/mean_conf": 0.6789411764705883, "calib/mu_c": 0.6597986577181209, "calib/mu_w": 0.7058490566037736, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1015686274509804, "calib/std_conf": 0.0810425721493136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5856219445953286, "calib/step_q_c_n": 1841.0, "calib/step_q_gap": -0.03614009506472804, "calib/step_q_w": 0.6217620396600566, "calib/step_q_w_n": 1765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 872.9921875, "completions/mean_terminated_length": 879.8661499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.005333333333333333, "grad_norm": 1.078272819519043, "kl": 0.001311957836151123, "learning_rate": 1.0000000000000002e-06, "loss": 0.0172, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019209835678339005, "mask/share_reasoning": 0.7786589860916138, "mask/share_step_conf": 0.19431866705417633, "num_tokens": 1592656.0, "reward": 0.5782305002212524, "reward_std": 0.2045661062002182, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7164065837860107, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.12442938983440399, "step": 5 }, { "adv/mean_abs_final_conf": 0.7472907304763794, "adv/mean_abs_reasoning": 0.18372581899166107, "adv/mean_abs_step_conf": 0.7483274340629578, "adv/ratio_final_to_reasoning": 4.067423591184522, "adv/ratio_step_to_reasoning": 4.073066258025078, "adv/std_final_conf": 0.9264699220657349, "adv/std_reasoning": 0.4374311864376068, "adv/std_step_conf": 0.9353809356689453, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.9296875, "calib/ece": 0.0898617187500001, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.00648107380744134, "calib/mean_conf": 0.65548671875, "calib/mu_c": 0.6532335329341318, "calib/mu_w": 0.6597146067415731, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.046502343749999994, "calib/std_conf": 0.06676177666605644, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.580767754318618, "calib/step_q_c_n": 2084.0, "calib/step_q_gap": -0.01605483948236075, "calib/step_q_w": 0.5968225938009788, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 756.47265625, "completions/mean_terminated_length": 762.4291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.0064, "grad_norm": 0.29219964146614075, "kl": 0.0009875297546386719, "learning_rate": 1.25e-06, "loss": 0.0021, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020911913365125656, "mask/share_reasoning": 0.7737997770309448, "mask/share_step_conf": 0.19747580587863922, "num_tokens": 1892265.0, "reward": 0.7232071757316589, "reward_std": 0.14570766687393188, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7624897956848145, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.35423704981803894, "step": 6 }, { "adv/mean_abs_final_conf": 0.7696272134780884, "adv/mean_abs_reasoning": 0.3594735264778137, "adv/mean_abs_step_conf": 0.7798027992248535, "adv/ratio_final_to_reasoning": 2.140984402993551, "adv/ratio_step_to_reasoning": 2.1692913157347125, "adv/std_final_conf": 0.9286594390869141, "adv/std_reasoning": 0.6403165459632874, "adv/std_step_conf": 0.9355884790420532, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 14.8515625, "calib/ece": 0.09394422310756981, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0199203187250996, "calib/gap": -0.004911071587372251, "calib/mean_conf": 0.6777689243027889, "calib/mu_c": 0.676242774566474, "calib/mu_w": 0.6811538461538462, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.041235059760956226, "calib/std_conf": 0.0627507114167244, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5945607553366175, "calib/step_q_c_n": 2436.0, "calib/step_q_gap": -0.028206448177291787, "calib/step_q_w": 0.6227672035139092, "calib/step_q_w_n": 1366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 888.43359375, "completions/mean_terminated_length": 906.1315307617188, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.007466666666666667, "grad_norm": 0.30397337675094604, "kl": 0.000798642635345459, "learning_rate": 1.5e-06, "loss": -0.1192, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01730389893054962, "mask/share_reasoning": 0.776395857334137, "mask/share_step_conf": 0.1867690086364746, "num_tokens": 2227128.0, "reward": 0.6936839818954468, "reward_std": 0.22909244894981384, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.764412522315979, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2917054295539856, "step": 7 }, { "adv/mean_abs_final_conf": 0.7443795204162598, "adv/mean_abs_reasoning": 0.359142541885376, "adv/mean_abs_step_conf": 0.740021824836731, "adv/ratio_final_to_reasoning": 2.0726576041605123, "adv/ratio_step_to_reasoning": 2.060523994043893, "adv/std_final_conf": 0.9140991568565369, "adv/std_reasoning": 0.6403468251228333, "adv/std_step_conf": 0.9353559613227844, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.953125, "calib/ece": 0.14135999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.03475222816399304, "calib/mean_conf": 0.6742400000000001, "calib/mu_c": 0.6624242424242424, "calib/mu_w": 0.6971764705882354, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07780000000000002, "calib/std_conf": 0.06498324707184153, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5856710158434297, "calib/step_q_c_n": 2146.0, "calib/step_q_gap": -0.04366905550020872, "calib/step_q_w": 0.6293400713436385, "calib/step_q_w_n": 1682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 857.8125, "completions/mean_terminated_length": 878.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.008533333333333334, "grad_norm": 0.4903322458267212, "kl": 0.0005944967269897461, "learning_rate": 1.75e-06, "loss": -0.1395, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01849009469151497, "mask/share_reasoning": 0.7680476903915405, "mask/share_step_conf": 0.190024733543396, "num_tokens": 2553240.0, "reward": 0.6751306056976318, "reward_std": 0.2096453160047531, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7378687262535095, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.28817370533943176, "step": 8 }, { "adv/mean_abs_final_conf": 0.7298851013183594, "adv/mean_abs_reasoning": 0.2622373104095459, "adv/mean_abs_step_conf": 0.7591660022735596, "adv/ratio_final_to_reasoning": 2.7832999819074953, "adv/ratio_step_to_reasoning": 2.894958009933603, "adv/std_final_conf": 0.9278061985969543, "adv/std_reasoning": 0.5727177262306213, "adv/std_step_conf": 0.9351189732551575, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.0859375, "calib/ece": 0.10218253968253967, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.027777777777777776, "calib/gap": 0.011294309501330257, "calib/mean_conf": 0.6726587301587301, "calib/mu_c": 0.675392670157068, "calib/mu_w": 0.6640983606557378, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.008452380952380949, "calib/std_conf": 0.07093811999361678, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6009610575205431, "calib/step_q_c_n": 2799.0, "calib/step_q_gap": -0.02568080513232618, "calib/step_q_w": 0.6266418626528693, "calib/step_q_w_n": 1063.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 865.7734375, "completions/mean_terminated_length": 876.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.0096, "grad_norm": 0.5019803047180176, "kl": 0.0006263852119445801, "learning_rate": 2.0000000000000003e-06, "loss": -0.1137, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018472330644726753, "mask/share_reasoning": 0.779818594455719, "mask/share_step_conf": 0.1899903267621994, "num_tokens": 2882414.0, "reward": 0.7639858722686768, "reward_std": 0.1735934615135193, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.7957402467727661, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3861377537250519, "step": 9 }, { "adv/mean_abs_final_conf": 0.7413267493247986, "adv/mean_abs_reasoning": 0.3311035633087158, "adv/mean_abs_step_conf": 0.7383732795715332, "adv/ratio_final_to_reasoning": 2.238957327782054, "adv/ratio_step_to_reasoning": 2.2300372493517546, "adv/std_final_conf": 0.9291539192199707, "adv/std_reasoning": 0.6403902173042297, "adv/std_step_conf": 0.935117781162262, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.2578125, "calib/ece": 0.18198380566801617, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.024613217213114602, "calib/mean_conf": 0.6712955465587044, "calib/mu_c": 0.6649180327868853, "calib/mu_w": 0.6895312499999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.056194331983805676, "calib/std_conf": 0.06283406026960563, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.589368, "calib/step_q_c_n": 2500.0, "calib/step_q_gap": -0.04469352204836419, "calib/step_q_w": 0.6340615220483642, "calib/step_q_w_n": 1406.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 857.03125, "completions/mean_terminated_length": 884.6773681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.010666666666666666, "grad_norm": 0.3038390278816223, "kl": 0.0007747411727905273, "learning_rate": 2.25e-06, "loss": -0.1784, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01796487718820572, "mask/share_reasoning": 0.7671234607696533, "mask/share_step_conf": 0.18366166949272156, "num_tokens": 3208614.0, "reward": 0.7771371006965637, "reward_std": 0.20995953679084778, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7620207071304321, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.4563159942626953, "step": 10 }, { "adv/mean_abs_final_conf": 0.7395666837692261, "adv/mean_abs_reasoning": 0.2475532591342926, "adv/mean_abs_step_conf": 0.7714335918426514, "adv/ratio_final_to_reasoning": 2.987505340691258, "adv/ratio_step_to_reasoning": 3.1162328241623527, "adv/std_final_conf": 0.9268957376480103, "adv/std_reasoning": 0.5482560396194458, "adv/std_step_conf": 0.9356309771537781, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.7265625, "calib/ece": 0.12937007874015752, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.040216941411546414, "calib/mean_conf": 0.6769291338582677, "calib/mu_c": 0.6641040462427745, "calib/mu_w": 0.7043209876543209, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06259842519685044, "calib/std_conf": 0.06555651298927354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5852804141501294, "calib/step_q_c_n": 2318.0, "calib/step_q_gap": -0.036255398522046844, "calib/step_q_w": 0.6215358126721763, "calib/step_q_w_n": 1452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 871.37890625, "completions/mean_terminated_length": 881.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.011733333333333333, "grad_norm": 0.25886476039886475, "kl": 0.0006725192070007324, "learning_rate": 2.5e-06, "loss": -0.0581, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01826527714729309, "mask/share_reasoning": 0.7783842086791992, "mask/share_step_conf": 0.1916317641735077, "num_tokens": 3536167.0, "reward": 0.6843187808990479, "reward_std": 0.19306161999702454, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7550671696662903, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.27997660636901855, "step": 11 }, { "adv/mean_abs_final_conf": 0.7612889409065247, "adv/mean_abs_reasoning": 0.41153576970100403, "adv/mean_abs_step_conf": 0.7721890211105347, "adv/ratio_final_to_reasoning": 1.8498730777633965, "adv/ratio_step_to_reasoning": 1.8763594272049755, "adv/std_final_conf": 0.9294129014015198, "adv/std_reasoning": 0.6818270087242126, "adv/std_step_conf": 0.9352567195892334, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 15.1484375, "calib/ece": 0.11770833333333333, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.008333333333333333, "calib/gap": -0.01616477272727279, "calib/mean_conf": 0.6647083333333332, "calib/mu_c": 0.6603977272727272, "calib/mu_w": 0.6765625, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024541666666666663, "calib/std_conf": 0.06142677155678043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5832122286220647, "calib/step_q_c_n": 2257.0, "calib/step_q_gap": -0.06902657458583183, "calib/step_q_w": 0.6522388032078965, "calib/step_q_w_n": 1621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 747.890625, "completions/mean_terminated_length": 794.4398803710938, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.0128, "grad_norm": 0.27160966396331787, "kl": 0.0008394122123718262, "learning_rate": 2.7500000000000004e-06, "loss": -0.1756, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.0197913758456707, "mask/share_reasoning": 0.7317686080932617, "mask/share_step_conf": 0.18984632194042206, "num_tokens": 3831803.0, "reward": 0.7235974669456482, "reward_std": 0.21748779714107513, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7402870655059814, "rewards/format_reward_step": 0.9375, "rewards/step_correlation_reward": 0.38190776109695435, "step": 12 }, { "adv/mean_abs_final_conf": 0.7387239336967468, "adv/mean_abs_reasoning": 0.27716708183288574, "adv/mean_abs_step_conf": 0.7305924892425537, "adv/ratio_final_to_reasoning": 2.665265762483839, "adv/ratio_step_to_reasoning": 2.6359280633587465, "adv/std_final_conf": 0.9279195070266724, "adv/std_reasoning": 0.5726190209388733, "adv/std_step_conf": 0.9355447292327881, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.41796875, "calib/ece": 0.11050980392156862, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.0006725146198829357, "calib/mean_conf": 0.6659607843137255, "calib/mu_c": 0.666111111111111, "calib/mu_w": 0.665438596491228, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.05874302988584404, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5896403978576894, "calib/step_q_c_n": 2614.0, "calib/step_q_gap": -0.009105034541823431, "calib/step_q_w": 0.5987454323995128, "calib/step_q_w_n": 821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 833.36328125, "completions/mean_terminated_length": 836.6314086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.013866666666666666, "grad_norm": 0.334152489900589, "kl": 0.0010883808135986328, "learning_rate": 3e-06, "loss": 0.0129, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01961558125913143, "mask/share_reasoning": 0.7841991782188416, "mask/share_step_conf": 0.19227902591228485, "num_tokens": 4149736.0, "reward": 0.827056884765625, "reward_std": 0.16083455085754395, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.8078383207321167, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.49236929416656494, "step": 13 }, { "adv/mean_abs_final_conf": 0.764909565448761, "adv/mean_abs_reasoning": 0.4123122990131378, "adv/mean_abs_step_conf": 0.7722538113594055, "adv/ratio_final_to_reasoning": 1.8551703824493193, "adv/ratio_step_to_reasoning": 1.8729827201560112, "adv/std_final_conf": 0.9291587471961975, "adv/std_reasoning": 0.6817663311958313, "adv/std_step_conf": 0.9356257319450378, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.640625, "calib/ece": 0.08677419354838706, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.012096774193548387, "calib/gap": -0.009504283965728222, "calib/mean_conf": 0.6731451612903225, "calib/mu_c": 0.6702325581395349, "calib/mu_w": 0.6797368421052631, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03318548387096776, "calib/std_conf": 0.06165548055962667, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5928045685279189, "calib/step_q_c_n": 2364.0, "calib/step_q_gap": -0.03080814823508693, "calib/step_q_w": 0.6236127167630058, "calib/step_q_w_n": 1384.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 912.83984375, "completions/mean_terminated_length": 934.748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.014933333333333333, "grad_norm": 0.32500895857810974, "kl": 0.001935720443725586, "learning_rate": 3.2500000000000002e-06, "loss": -0.1, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01796538010239601, "mask/share_reasoning": 0.7767523527145386, "mask/share_step_conf": 0.18184472620487213, "num_tokens": 4488823.0, "reward": 0.7065720558166504, "reward_std": 0.2284417450428009, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7548531293869019, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.3301659822463989, "step": 14 }, { "adv/mean_abs_final_conf": 0.7412395477294922, "adv/mean_abs_reasoning": 0.4259348511695862, "adv/mean_abs_step_conf": 0.7584189176559448, "adv/ratio_final_to_reasoning": 1.7402650797278085, "adv/ratio_step_to_reasoning": 1.7805984074169594, "adv/std_final_conf": 0.9293286204338074, "adv/std_reasoning": 0.7205248475074768, "adv/std_step_conf": 0.9360866546630859, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.19921875, "calib/ece": 0.09861660079051382, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": -0.025413377648524915, "calib/mean_conf": 0.6684980237154151, "calib/mu_c": 0.6597590361445784, "calib/mu_w": 0.6851724137931033, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05549407114624505, "calib/std_conf": 0.06361411734000602, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5882619047619048, "calib/step_q_c_n": 2100.0, "calib/step_q_gap": -0.026754381883046352, "calib/step_q_w": 0.6150162866449511, "calib/step_q_w_n": 1535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 820.140625, "completions/mean_terminated_length": 829.8656616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.016, "grad_norm": 0.40916433930397034, "kl": 0.004666328430175781, "learning_rate": 3.5e-06, "loss": -0.0619, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019354475662112236, "mask/share_reasoning": 0.7726200819015503, "mask/share_step_conf": 0.19630667567253113, "num_tokens": 4806659.0, "reward": 0.7369135618209839, "reward_std": 0.24567002058029175, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7498167753219604, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.39666664600372314, "step": 15 }, { "adv/mean_abs_final_conf": 0.7580640912055969, "adv/mean_abs_reasoning": 0.30199116468429565, "adv/mean_abs_step_conf": 0.7372256517410278, "adv/ratio_final_to_reasoning": 2.5102194363801473, "adv/ratio_step_to_reasoning": 2.441215962432975, "adv/std_final_conf": 0.9269432425498962, "adv/std_reasoning": 0.5960419178009033, "adv/std_step_conf": 0.9359452128410339, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.15625, "calib/ece": 0.12853174603174605, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.019044067796610387, "calib/mean_conf": 0.6870238095238095, "calib/mu_c": 0.6813559322033897, "calib/mu_w": 0.7004000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05658730158730163, "calib/std_conf": 0.059116103718198997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5930035615354174, "calib/step_q_c_n": 2527.0, "calib/step_q_gap": -0.024141006092076922, "calib/step_q_w": 0.6171445676274944, "calib/step_q_w_n": 1353.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 995.6875, "completions/mean_terminated_length": 1015.52197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.017066666666666667, "grad_norm": 0.2688722014427185, "kl": 0.003340482711791992, "learning_rate": 3.7500000000000005e-06, "loss": -0.0806, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.015878742560744286, "mask/share_reasoning": 0.7861171960830688, "mask/share_step_conf": 0.17847280204296112, "num_tokens": 5170403.0, "reward": 0.7015179991722107, "reward_std": 0.21285340189933777, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7670894861221313, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3007902503013611, "step": 16 }, { "adv/mean_abs_final_conf": 0.7609270811080933, "adv/mean_abs_reasoning": 0.2718398869037628, "adv/mean_abs_step_conf": 0.7644103169441223, "adv/ratio_final_to_reasoning": 2.7991737701740504, "adv/ratio_step_to_reasoning": 2.8119873269912743, "adv/std_final_conf": 0.9288200736045837, "adv/std_reasoning": 0.5483255982398987, "adv/std_step_conf": 0.93514484167099, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.578125, "calib/ece": 0.12751968503937006, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.007840983760442954, "calib/mean_conf": 0.6712992125984253, "calib/mu_c": 0.6729353233830845, "calib/mu_w": 0.6650943396226415, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003740157480314958, "calib/std_conf": 0.062237520010181915, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.594040331292762, "calib/step_q_c_n": 2777.0, "calib/step_q_gap": -0.00634593480165857, "calib/step_q_w": 0.6003862660944206, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 804.3828125, "completions/mean_terminated_length": 813.9209594726562, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.018133333333333335, "grad_norm": 0.2445593923330307, "kl": 0.007335186004638672, "learning_rate": 4.000000000000001e-06, "loss": -0.0116, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020442228764295578, "mask/share_reasoning": 0.7655640840530396, "mask/share_step_conf": 0.20227494835853577, "num_tokens": 5479853.0, "reward": 0.8299559354782104, "reward_std": 0.1681516319513321, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.8127847909927368, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.49165841937065125, "step": 17 }, { "adv/mean_abs_final_conf": 0.73135906457901, "adv/mean_abs_reasoning": 0.3147658407688141, "adv/mean_abs_step_conf": 0.7463362812995911, "adv/ratio_final_to_reasoning": 2.3235020127745405, "adv/ratio_step_to_reasoning": 2.3710841032707624, "adv/std_final_conf": 0.9284005165100098, "adv/std_reasoning": 0.6185328960418701, "adv/std_step_conf": 0.9355065822601318, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.140625, "calib/ece": 0.12024390243902439, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.012195121951219513, "calib/gap": -0.033293650793650875, "calib/mean_conf": 0.6708130081300813, "calib/mu_c": 0.6594444444444444, "calib/mu_w": 0.6927380952380953, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.066260162601626, "calib/std_conf": 0.07266241685120631, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5880245746691871, "calib/step_q_c_n": 2116.0, "calib/step_q_gap": -0.0349936071489948, "calib/step_q_w": 0.6230181818181819, "calib/step_q_w_n": 1760.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 849.63671875, "completions/mean_terminated_length": 880.5951538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.0192, "grad_norm": 0.27101266384124756, "kl": 0.00897979736328125, "learning_rate": 4.25e-06, "loss": -0.0554, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018169566988945007, "mask/share_reasoning": 0.7656527757644653, "mask/share_step_conf": 0.18102139234542847, "num_tokens": 5808080.0, "reward": 0.6677666306495667, "reward_std": 0.19289115071296692, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7252484560012817, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.29153478145599365, "step": 18 }, { "adv/mean_abs_final_conf": 0.7627384662628174, "adv/mean_abs_reasoning": 0.2930169403553009, "adv/mean_abs_step_conf": 0.7524563074111938, "adv/ratio_final_to_reasoning": 2.6030524560728487, "adv/ratio_step_to_reasoning": 2.5679617925803018, "adv/std_final_conf": 0.9256052374839783, "adv/std_reasoning": 0.5726289749145508, "adv/std_step_conf": 0.9351460337638855, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.04296875, "calib/ece": 0.08541176470588227, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007218903986859537, "calib/mean_conf": 0.6444705882352941, "calib/mu_c": 0.6423756906077349, "calib/mu_w": 0.6495945945945945, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010039215686274503, "calib/std_conf": 0.033984052165902216, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.572906704172527, "calib/step_q_c_n": 2133.0, "calib/step_q_gap": -0.0059038221432624605, "calib/step_q_w": 0.5788105263157894, "calib/step_q_w_n": 950.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 724.125, "completions/mean_terminated_length": 726.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.020266666666666665, "grad_norm": 0.5130786895751953, "kl": 0.014491081237792969, "learning_rate": 4.5e-06, "loss": 0.0017, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02103264629840851, "mask/share_reasoning": 0.7846888899803162, "mask/share_step_conf": 0.19037219882011414, "num_tokens": 6098216.0, "reward": 0.7534011006355286, "reward_std": 0.15472280979156494, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7825515270233154, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.38362565636634827, "step": 19 }, { "adv/mean_abs_final_conf": 0.7196226119995117, "adv/mean_abs_reasoning": 0.1906130015850067, "adv/mean_abs_step_conf": 0.760223388671875, "adv/ratio_final_to_reasoning": 3.7753070672809548, "adv/ratio_step_to_reasoning": 3.98830815500716, "adv/std_final_conf": 0.9120534658432007, "adv/std_reasoning": 0.4959540367126465, "adv/std_step_conf": 0.9357298612594604, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.82421875, "calib/ece": 0.19436507936507932, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.05569444444444449, "calib/mean_conf": 0.653968253968254, "calib/mu_c": 0.6380555555555556, "calib/mu_w": 0.6937500000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06702380952380953, "calib/std_conf": 0.07203481703457948, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5779971181556196, "calib/step_q_c_n": 2082.0, "calib/step_q_gap": -0.05537131012166252, "calib/step_q_w": 0.6333684282772821, "calib/step_q_w_n": 1457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 757.83984375, "completions/mean_terminated_length": 769.8690795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.021333333333333333, "grad_norm": 2.31535005569458, "kl": 0.017386436462402344, "learning_rate": 4.75e-06, "loss": -0.0447, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022121768444776535, "mask/share_reasoning": 0.7714666724205017, "mask/share_step_conf": 0.19078657031059265, "num_tokens": 6397095.0, "reward": 0.7693119049072266, "reward_std": 0.15373152494430542, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7524155974388123, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.4487082362174988, "step": 20 }, { "adv/mean_abs_final_conf": 0.7619816660881042, "adv/mean_abs_reasoning": 0.36749473214149475, "adv/mean_abs_step_conf": 0.7683507204055786, "adv/ratio_final_to_reasoning": 2.0734492210210025, "adv/ratio_step_to_reasoning": 2.09078022949658, "adv/std_final_conf": 0.9265251755714417, "adv/std_reasoning": 0.6403968930244446, "adv/std_step_conf": 0.9350817799568176, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.81640625, "calib/ece": 0.1926907630522088, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.01606425702811245, "calib/gap": -0.047142412935323175, "calib/mean_conf": 0.6492369477911647, "calib/mu_c": 0.6401492537313433, "calib/mu_w": 0.6872916666666665, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.017349397590361443, "calib/std_conf": 0.06157096731729032, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5765693130399013, "calib/step_q_c_n": 2431.0, "calib/step_q_gap": -0.06329271227655442, "calib/step_q_w": 0.6398620253164558, "calib/step_q_w_n": 1106.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 707.16796875, "completions/mean_terminated_length": 724.1400146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.0224, "grad_norm": 0.5563187003135681, "kl": 0.024248123168945312, "learning_rate": 5e-06, "loss": -0.0013, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02241862565279007, "mask/share_reasoning": 0.7598947882652283, "mask/share_step_conf": 0.19424910843372345, "num_tokens": 6681090.0, "reward": 0.7932220697402954, "reward_std": 0.20893710851669312, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.7790640592575073, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.45581766963005066, "step": 21 }, { "adv/mean_abs_final_conf": 0.7331605553627014, "adv/mean_abs_reasoning": 0.1721845418214798, "adv/mean_abs_step_conf": 0.7609281539916992, "adv/ratio_final_to_reasoning": 4.257992893013817, "adv/ratio_step_to_reasoning": 4.4192593942644764, "adv/std_final_conf": 0.9263607263565063, "adv/std_reasoning": 0.46741846203804016, "adv/std_step_conf": 0.9356198310852051, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.46484375, "calib/ece": 0.08101562500000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0006332809342016743, "calib/mean_conf": 0.641328125, "calib/mu_c": 0.6411475409836065, "calib/mu_w": 0.6417808219178082, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0037500000000000025, "calib/std_conf": 0.044841231963276565, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5746539027982328, "calib/step_q_c_n": 2037.0, "calib/step_q_gap": -0.0007247163554421343, "calib/step_q_w": 0.5753786191536749, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 684.60546875, "completions/mean_terminated_length": 689.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.023466666666666667, "grad_norm": 0.28116798400878906, "kl": 0.0274505615234375, "learning_rate": 4.9722222222222224e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02279532700777054, "mask/share_reasoning": 0.7776881456375122, "mask/share_step_conf": 0.19170401990413666, "num_tokens": 6958165.0, "reward": 0.7616775631904602, "reward_std": 0.14698928594589233, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7884843945503235, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.3919020891189575, "step": 22 }, { "adv/mean_abs_final_conf": 0.7574082612991333, "adv/mean_abs_reasoning": 0.27920395135879517, "adv/mean_abs_step_conf": 0.7400485277175903, "adv/ratio_final_to_reasoning": 2.712741913619319, "adv/ratio_step_to_reasoning": 2.6505660973493175, "adv/std_final_conf": 0.9250745177268982, "adv/std_reasoning": 0.5725696086883545, "adv/std_step_conf": 0.9337242245674133, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.47265625, "calib/ece": 0.14820312500000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.002167384026781183, "calib/mean_conf": 0.6525781249999999, "calib/mu_c": 0.6521463414634147, "calib/mu_w": 0.6543137254901958, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.06460512380209774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5821149241819633, "calib/step_q_c_n": 2506.0, "calib/step_q_gap": -0.006706036516726632, "calib/step_q_w": 0.5888209606986899, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 760.65625, "completions/mean_terminated_length": 766.6456909179688, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.024533333333333334, "grad_norm": 0.27971068024635315, "kl": 0.02925872802734375, "learning_rate": 4.944444444444445e-06, "loss": -0.006, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022922851145267487, "mask/share_reasoning": 0.779192328453064, "mask/share_step_conf": 0.19007235765457153, "num_tokens": 7256829.0, "reward": 0.8002302646636963, "reward_std": 0.1789833903312683, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8136398792266846, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.42666441202163696, "step": 23 }, { "adv/mean_abs_final_conf": 0.7580659985542297, "adv/mean_abs_reasoning": 0.5364935398101807, "adv/mean_abs_step_conf": 0.7590062618255615, "adv/ratio_final_to_reasoning": 1.4130011683317654, "adv/ratio_step_to_reasoning": 1.4147537770801661, "adv/std_final_conf": 0.9311103820800781, "adv/std_reasoning": 0.7753890752792358, "adv/std_step_conf": 0.936369776725769, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.07421875, "calib/ece": 0.08474308300395252, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": -0.013661558960692988, "calib/mean_conf": 0.6558893280632411, "calib/mu_c": 0.650759493670886, "calib/mu_w": 0.664421052631579, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05806324110671935, "calib/std_conf": 0.07233335491967625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5837024972855592, "calib/step_q_c_n": 1842.0, "calib/step_q_gap": -0.030762618993510515, "calib/step_q_w": 0.6144651162790697, "calib/step_q_w_n": 1505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 770.25390625, "completions/mean_terminated_length": 779.3873901367188, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.0256, "grad_norm": 0.3319094479084015, "kl": 0.032161712646484375, "learning_rate": 4.9166666666666665e-06, "loss": -0.0027, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02267495170235634, "mask/share_reasoning": 0.7736426591873169, "mask/share_step_conf": 0.19196362793445587, "num_tokens": 7558526.0, "reward": 0.6610881090164185, "reward_std": 0.2659045457839966, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7440546751022339, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.257027804851532, "step": 24 }, { "adv/mean_abs_final_conf": 0.7323684692382812, "adv/mean_abs_reasoning": 0.13908778131008148, "adv/mean_abs_step_conf": 0.7595969438552856, "adv/ratio_final_to_reasoning": 5.265512630513124, "adv/ratio_step_to_reasoning": 5.461277307758937, "adv/std_final_conf": 0.9252333641052246, "adv/std_reasoning": 0.40499961376190186, "adv/std_step_conf": 0.9354682564735413, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.078125, "calib/ece": 0.13213438735177863, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.027592477944590632, "calib/mean_conf": 0.6494466403162056, "calib/mu_c": 0.6417032967032966, "calib/mu_w": 0.6692957746478873, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.031106719367588943, "calib/std_conf": 0.05357284316037042, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5738152610441767, "calib/step_q_c_n": 1992.0, "calib/step_q_gap": -0.033366557137641584, "calib/step_q_w": 0.6071818181818183, "calib/step_q_w_n": 1100.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 727.58984375, "completions/mean_terminated_length": 733.3189086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.02666666666666667, "grad_norm": 0.22611305117607117, "kl": 0.039325714111328125, "learning_rate": 4.888888888888889e-06, "loss": -0.0109, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02241634577512741, "mask/share_reasoning": 0.7723724246025085, "mask/share_step_conf": 0.19739872217178345, "num_tokens": 7848013.0, "reward": 0.7145950794219971, "reward_std": 0.156108558177948, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7700910568237305, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.31925538182258606, "step": 25 }, { "adv/mean_abs_final_conf": 0.7534859776496887, "adv/mean_abs_reasoning": 0.2689468264579773, "adv/mean_abs_step_conf": 0.7727804183959961, "adv/ratio_final_to_reasoning": 2.8016169128042128, "adv/ratio_step_to_reasoning": 2.87335763940216, "adv/std_final_conf": 0.9244814515113831, "adv/std_reasoning": 0.5483368635177612, "adv/std_step_conf": 0.9353242516517639, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.46484375, "calib/ece": 0.1467588932806324, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.019890109890109864, "calib/mean_conf": 0.635691699604743, "calib/mu_c": 0.63010989010989, "calib/mu_w": 0.6499999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03154150197628457, "calib/std_conf": 0.038673864517475974, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5706611140031235, "calib/step_q_c_n": 1921.0, "calib/step_q_gap": -0.02551245601660035, "calib/step_q_w": 0.5961735700197238, "calib/step_q_w_n": 1014.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 676.83984375, "completions/mean_terminated_length": 684.8656616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.027733333333333332, "grad_norm": 0.3391321897506714, "kl": 0.0453948974609375, "learning_rate": 4.861111111111111e-06, "loss": 0.0522, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022081192582845688, "mask/share_reasoning": 0.779906690120697, "mask/share_step_conf": 0.186293363571167, "num_tokens": 8126524.0, "reward": 0.740322470664978, "reward_std": 0.18615061044692993, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7724347710609436, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3683663606643677, "step": 26 }, { "adv/mean_abs_final_conf": 0.7368289232254028, "adv/mean_abs_reasoning": 0.2924867272377014, "adv/mean_abs_step_conf": 0.7530105113983154, "adv/ratio_final_to_reasoning": 2.519187554882066, "adv/ratio_step_to_reasoning": 2.5745117342926482, "adv/std_final_conf": 0.9256953001022339, "adv/std_reasoning": 0.5960046648979187, "adv/std_step_conf": 0.9359559416770935, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.2421875, "calib/ece": 0.08752362204724409, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0006899336791647004, "calib/mean_conf": 0.6411377952755906, "calib/mu_c": 0.6414012738853503, "calib/mu_w": 0.6407113402061856, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05527559055118106, "calib/std_conf": 0.052515920989200174, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5727674129353234, "calib/step_q_c_n": 1608.0, "calib/step_q_gap": -0.012955421710345916, "calib/step_q_w": 0.5857228346456693, "calib/step_q_w_n": 1270.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 720.60546875, "completions/mean_terminated_length": 726.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.0288, "grad_norm": 0.2796003818511963, "kl": 0.049957275390625, "learning_rate": 4.833333333333333e-06, "loss": -0.0482, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022812340408563614, "mask/share_reasoning": 0.7833442687988281, "mask/share_step_conf": 0.18603089451789856, "num_tokens": 8416215.0, "reward": 0.7067053318023682, "reward_std": 0.1922786831855774, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.755042314529419, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3372744917869568, "step": 27 }, { "adv/mean_abs_final_conf": 0.7352758049964905, "adv/mean_abs_reasoning": 0.33253324031829834, "adv/mean_abs_step_conf": 0.7541956901550293, "adv/ratio_final_to_reasoning": 2.2111347554087826, "adv/ratio_step_to_reasoning": 2.268030977694497, "adv/std_final_conf": 0.9242193102836609, "adv/std_reasoning": 0.6185281872749329, "adv/std_step_conf": 0.9362373352050781, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.30078125, "calib/ece": 0.1744313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": -0.032871794871795035, "calib/mean_conf": 0.6395294117647058, "calib/mu_c": 0.6317948717948717, "calib/mu_w": 0.6646666666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024627450980392148, "calib/std_conf": 0.06496284094446263, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5762992125984252, "calib/step_q_c_n": 2032.0, "calib/step_q_gap": -0.0264766294457095, "calib/step_q_w": 0.6027758420441347, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 732.38671875, "completions/mean_terminated_length": 738.153564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.029866666666666666, "grad_norm": 0.33571648597717285, "kl": 0.046184539794921875, "learning_rate": 4.805555555555556e-06, "loss": -0.0196, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023265177384018898, "mask/share_reasoning": 0.7898497581481934, "mask/share_step_conf": 0.1790725439786911, "num_tokens": 8710650.0, "reward": 0.7391840219497681, "reward_std": 0.20819517970085144, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7852710485458374, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.34153443574905396, "step": 28 }, { "adv/mean_abs_final_conf": 0.7568809390068054, "adv/mean_abs_reasoning": 0.33271366357803345, "adv/mean_abs_step_conf": 0.7712277770042419, "adv/ratio_final_to_reasoning": 2.2748718248214934, "adv/ratio_step_to_reasoning": 2.317992500549533, "adv/std_final_conf": 0.9278278350830078, "adv/std_reasoning": 0.6184771060943604, "adv/std_step_conf": 0.9358929395675659, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.52734375, "calib/ece": 0.0757905138339921, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.013584249084249045, "calib/mean_conf": 0.6536166007905139, "calib/mu_c": 0.6494285714285715, "calib/mu_w": 0.6630128205128205, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018853754940711443, "calib/std_conf": 0.059171346629624394, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5832805219012116, "calib/step_q_c_n": 2146.0, "calib/step_q_gap": -0.005145491293887305, "calib/step_q_w": 0.5884260131950989, "calib/step_q_w_n": 1061.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 795.890625, "completions/mean_terminated_length": 805.328125, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.030933333333333334, "grad_norm": 0.282016396522522, "kl": 0.0450897216796875, "learning_rate": 4.777777777777778e-06, "loss": -0.0624, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020920034497976303, "mask/share_reasoning": 0.7820408940315247, "mask/share_step_conf": 0.18532030284404755, "num_tokens": 9021526.0, "reward": 0.7136837244033813, "reward_std": 0.19021692872047424, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.766909658908844, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3260827660560608, "step": 29 }, { "adv/mean_abs_final_conf": 0.7519515752792358, "adv/mean_abs_reasoning": 0.3583611845970154, "adv/mean_abs_step_conf": 0.7747514843940735, "adv/ratio_final_to_reasoning": 2.098306422680294, "adv/ratio_step_to_reasoning": 2.1619291309835846, "adv/std_final_conf": 0.9269018173217773, "adv/std_reasoning": 0.6402735710144043, "adv/std_step_conf": 0.9351660013198853, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.05859375, "calib/ece": 0.1345634920634921, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.014091233071988674, "calib/mean_conf": 0.6510714285714285, "calib/mu_c": 0.6472131147540984, "calib/mu_w": 0.661304347826087, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02972222222222222, "calib/std_conf": 0.060310397978361824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.585917667238422, "calib/step_q_c_n": 2332.0, "calib/step_q_gap": -0.018859780832794715, "calib/step_q_w": 0.6047774480712167, "calib/step_q_w_n": 1011.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 799.64453125, "completions/mean_terminated_length": 812.3373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.032, "grad_norm": 0.4014255404472351, "kl": 0.047332763671875, "learning_rate": 4.75e-06, "loss": -0.0524, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020229246467351913, "mask/share_reasoning": 0.7831885814666748, "mask/share_step_conf": 0.18095718324184418, "num_tokens": 9333219.0, "reward": 0.720932126045227, "reward_std": 0.19922210276126862, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7739925384521484, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3280279040336609, "step": 30 }, { "adv/mean_abs_final_conf": 0.7201747894287109, "adv/mean_abs_reasoning": 0.2836382985115051, "adv/mean_abs_step_conf": 0.742740273475647, "adv/ratio_final_to_reasoning": 2.5390604625965163, "adv/ratio_step_to_reasoning": 2.6186177162021, "adv/std_final_conf": 0.9259783029556274, "adv/std_reasoning": 0.5726578831672668, "adv/std_step_conf": 0.9356769919395447, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.1796875, "calib/ece": 0.06511811023622044, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.001478797638217788, "calib/mean_conf": 0.6463779527559056, "calib/mu_c": 0.6469135802469136, "calib/mu_w": 0.6454347826086958, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03685039370078735, "calib/std_conf": 0.06318538268290515, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5829363024339721, "calib/step_q_c_n": 1931.0, "calib/step_q_gap": -0.00392132182887539, "calib/step_q_w": 0.5868576242628475, "calib/step_q_w_n": 1187.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 754.67578125, "completions/mean_terminated_length": 763.62451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 0.03306666666666667, "grad_norm": 2.021209716796875, "kl": 0.055149078369140625, "learning_rate": 4.722222222222222e-06, "loss": -0.0723, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021880831569433212, "mask/share_reasoning": 0.7856607437133789, "mask/share_step_conf": 0.18073971569538116, "num_tokens": 9632328.0, "reward": 0.6749727725982666, "reward_std": 0.17479467391967773, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7596234083175659, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.26532211899757385, "step": 31 }, { "adv/mean_abs_final_conf": 0.7195700407028198, "adv/mean_abs_reasoning": 0.30481335520744324, "adv/mean_abs_step_conf": 0.7410525679588318, "adv/ratio_final_to_reasoning": 2.3606906600700306, "adv/ratio_step_to_reasoning": 2.4311683044678354, "adv/std_final_conf": 0.9117735624313354, "adv/std_reasoning": 0.5960460901260376, "adv/std_step_conf": 0.9353088140487671, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.48828125, "calib/ece": 0.04129921259842524, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.017254580077483705, "calib/mean_conf": 0.6309842519685039, "calib/mu_c": 0.6243949044585987, "calib/mu_w": 0.6416494845360824, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027086614173228385, "calib/std_conf": 0.038354641077149774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5667146282973621, "calib/step_q_c_n": 1668.0, "calib/step_q_gap": -0.020873745622512208, "calib/step_q_w": 0.5875883739198743, "calib/step_q_w_n": 1273.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 668.54296875, "completions/mean_terminated_length": 676.4703979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.034133333333333335, "grad_norm": 0.29071611166000366, "kl": 0.06232452392578125, "learning_rate": 4.694444444444445e-06, "loss": -0.0108, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02311065047979355, "mask/share_reasoning": 0.7838659882545471, "mask/share_step_conf": 0.18130461871623993, "num_tokens": 9910179.0, "reward": 0.6577374935150146, "reward_std": 0.17762039601802826, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7482753992080688, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.2461058646440506, "step": 32 }, { "adv/mean_abs_final_conf": 0.7352038621902466, "adv/mean_abs_reasoning": 0.2771219313144684, "adv/mean_abs_step_conf": 0.7629679441452026, "adv/ratio_final_to_reasoning": 2.6529977569908123, "adv/ratio_step_to_reasoning": 2.7531849988423076, "adv/std_final_conf": 0.9094998240470886, "adv/std_reasoning": 0.5483477115631104, "adv/std_step_conf": 0.9357249736785889, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.3984375, "calib/ece": 0.042862745098039265, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0034060534591194846, "calib/mean_conf": 0.6263137254901961, "calib/mu_c": 0.6250314465408805, "calib/mu_w": 0.6284375, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02282352941176473, "calib/std_conf": 0.04399353348029126, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5658475110270952, "calib/step_q_c_n": 1587.0, "calib/step_q_gap": -0.011175744786858233, "calib/step_q_w": 0.5770232558139534, "calib/step_q_w_n": 1075.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 651.59765625, "completions/mean_terminated_length": 656.7283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.0352, "grad_norm": 0.2806883156299591, "kl": 0.0635223388671875, "learning_rate": 4.666666666666667e-06, "loss": -0.018, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.025103997439146042, "mask/share_reasoning": 0.7836605310440063, "mask/share_step_conf": 0.18342293798923492, "num_tokens": 10183860.0, "reward": 0.691365122795105, "reward_std": 0.17519938945770264, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7587417960166931, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3005508780479431, "step": 33 }, { "adv/mean_abs_final_conf": 0.7703639268875122, "adv/mean_abs_reasoning": 0.40508854389190674, "adv/mean_abs_step_conf": 0.7641962766647339, "adv/ratio_final_to_reasoning": 1.9017173862440184, "adv/ratio_step_to_reasoning": 1.8864919489519085, "adv/std_final_conf": 0.9283607602119446, "adv/std_reasoning": 0.6612812876701355, "adv/std_step_conf": 0.9356377124786377, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.73046875, "calib/ece": 0.11338582677165357, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0071595394736843065, "calib/mean_conf": 0.635511811023622, "calib/mu_c": 0.6373157894736843, "calib/mu_w": 0.63015625, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.00043307086614173264, "calib/std_conf": 0.05153866618829232, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5749976862563627, "calib/step_q_c_n": 2161.0, "calib/step_q_gap": -0.014384736546487642, "calib/step_q_w": 0.5893824228028504, "calib/step_q_w_n": 842.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2060.0, "completions/max_terminated_length": 2060.0, "completions/mean_length": 647.30078125, "completions/mean_terminated_length": 654.976318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.03626666666666667, "grad_norm": 0.36007630825042725, "kl": 0.0668487548828125, "learning_rate": 4.638888888888889e-06, "loss": 0.0003, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02514474466443062, "mask/share_reasoning": 0.7692458033561707, "mask/share_step_conf": 0.19389072060585022, "num_tokens": 10454681.0, "reward": 0.7932491302490234, "reward_std": 0.20490731298923492, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.792660117149353, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.4469631314277649, "step": 34 }, { "adv/mean_abs_final_conf": 0.7443052530288696, "adv/mean_abs_reasoning": 0.3177647888660431, "adv/mean_abs_step_conf": 0.7797359228134155, "adv/ratio_final_to_reasoning": 2.3423150679625455, "adv/ratio_step_to_reasoning": 2.4538147401288093, "adv/std_final_conf": 0.9288173317909241, "adv/std_reasoning": 0.618408203125, "adv/std_step_conf": 0.9358792901039124, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.7578125, "calib/ece": 0.08882352941176469, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.003586220629231196, "calib/mean_conf": 0.6497647058823528, "calib/mu_c": 0.6484567901234568, "calib/mu_w": 0.652043010752688, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05164705882352938, "calib/std_conf": 0.05471444166647668, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5834260178748759, "calib/step_q_c_n": 2014.0, "calib/step_q_gap": -0.002851937396689541, "calib/step_q_w": 0.5862779552715655, "calib/step_q_w_n": 1252.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 806.93359375, "completions/mean_terminated_length": 813.2874145507812, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 0.037333333333333336, "grad_norm": 0.2525840699672699, "kl": 0.05096435546875, "learning_rate": 4.611111111111112e-06, "loss": -0.0398, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020500820130109787, "mask/share_reasoning": 0.7965471148490906, "mask/share_step_conf": 0.17513957619667053, "num_tokens": 10770512.0, "reward": 0.6974085569381714, "reward_std": 0.17602644860744476, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7604573965072632, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3085784614086151, "step": 35 }, { "adv/mean_abs_final_conf": 0.7312940359115601, "adv/mean_abs_reasoning": 0.340577095746994, "adv/mean_abs_step_conf": 0.721725344657898, "adv/ratio_final_to_reasoning": 2.147220247760935, "adv/ratio_step_to_reasoning": 2.119124725856636, "adv/std_final_conf": 0.9102586507797241, "adv/std_reasoning": 0.6185452938079834, "adv/std_step_conf": 0.9327031970024109, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.02734375, "calib/ece": 0.19439215686274505, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0021269841269840883, "calib/mean_conf": 0.6291372549019607, "calib/mu_c": 0.6287619047619049, "calib/mu_w": 0.6308888888888889, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.05036378235256556, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.571522506619594, "calib/step_q_c_n": 2266.0, "calib/step_q_gap": -0.013935303075199523, "calib/step_q_w": 0.5854578096947936, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 614.36328125, "completions/mean_terminated_length": 619.2008056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.0384, "grad_norm": 0.6964386701583862, "kl": 0.0863037109375, "learning_rate": 4.583333333333333e-06, "loss": -0.0322, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02720923349261284, "mask/share_reasoning": 0.7675433158874512, "mask/share_step_conf": 0.1974349170923233, "num_tokens": 11030501.0, "reward": 0.8428558111190796, "reward_std": 0.20296572148799896, "rewards/accuracy_reward_step": 0.8203125, "rewards/final_brier_reward_step": 0.8105496168136597, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.51188063621521, "step": 36 }, { "adv/mean_abs_final_conf": 0.6925872564315796, "adv/mean_abs_reasoning": 0.2421915978193283, "adv/mean_abs_step_conf": 0.7570322751998901, "adv/ratio_final_to_reasoning": 2.859666737688565, "adv/ratio_step_to_reasoning": 3.1257577967862704, "adv/std_final_conf": 0.9100953936576843, "adv/std_reasoning": 0.5482887625694275, "adv/std_step_conf": 0.93547523021698, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 11.7265625, "calib/ece": 0.16091269841269842, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.03623740753786542, "calib/mean_conf": 0.6321031746031744, "calib/mu_c": 0.6198802395209582, "calib/mu_w": 0.6561176470588236, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06515873015873017, "calib/std_conf": 0.06378863029569816, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5700119904076738, "calib/step_q_c_n": 1668.0, "calib/step_q_gap": -0.03401349684869803, "calib/step_q_w": 0.6040254872563718, "calib/step_q_w_n": 1334.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 673.62109375, "completions/mean_terminated_length": 681.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.039466666666666664, "grad_norm": 0.228117436170578, "kl": 0.06999969482421875, "learning_rate": 4.555555555555556e-06, "loss": -0.0816, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.025216789916157722, "mask/share_reasoning": 0.7774437069892883, "mask/share_step_conf": 0.1856207549571991, "num_tokens": 11310044.0, "reward": 0.7160718441009521, "reward_std": 0.171960711479187, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.743464469909668, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3613356053829193, "step": 37 }, { "adv/mean_abs_final_conf": 0.75675368309021, "adv/mean_abs_reasoning": 0.33615952730178833, "adv/mean_abs_step_conf": 0.7887274026870728, "adv/ratio_final_to_reasoning": 2.2511742837228343, "adv/ratio_step_to_reasoning": 2.346289004562379, "adv/std_final_conf": 0.9278170466423035, "adv/std_reasoning": 0.6186119318008423, "adv/std_step_conf": 0.9357684254646301, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.66015625, "calib/ece": 0.12379446640316209, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03058963871847309, "calib/mean_conf": 0.646403162055336, "calib/mu_c": 0.635521472392638, "calib/mu_w": 0.6661111111111111, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06296442687747035, "calib/std_conf": 0.05874937986285626, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5749078726968174, "calib/step_q_c_n": 1791.0, "calib/step_q_gap": -0.03290592040663087, "calib/step_q_w": 0.6078137931034483, "calib/step_q_w_n": 1450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 706.0234375, "completions/mean_terminated_length": 714.395263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.04053333333333333, "grad_norm": 0.322736531496048, "kl": 0.07154083251953125, "learning_rate": 4.527777777777778e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02300211228430271, "mask/share_reasoning": 0.7782649993896484, "mask/share_step_conf": 0.18701410293579102, "num_tokens": 11597674.0, "reward": 0.6569255590438843, "reward_std": 0.20873770117759705, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7445077896118164, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2443433701992035, "step": 38 }, { "adv/mean_abs_final_conf": 0.7578836679458618, "adv/mean_abs_reasoning": 0.31737250089645386, "adv/mean_abs_step_conf": 0.7529335021972656, "adv/ratio_final_to_reasoning": 2.3879941261613253, "adv/ratio_step_to_reasoning": 2.3723967894840334, "adv/std_final_conf": 0.9278097152709961, "adv/std_reasoning": 0.5960695743560791, "adv/std_step_conf": 0.9358320236206055, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.0390625, "calib/ece": 0.10490196078431367, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.03607945835462456, "calib/mean_conf": 0.6428627450980392, "calib/mu_c": 0.6282894736842105, "calib/mu_w": 0.6643689320388351, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0758431372549019, "calib/std_conf": 0.05763339840814769, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.569218958611482, "calib/step_q_c_n": 1498.0, "calib/step_q_gap": -0.03725200098447767, "calib/step_q_w": 0.6064709595959596, "calib/step_q_w_n": 1584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 697.52734375, "completions/mean_terminated_length": 703.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.0416, "grad_norm": 0.438943475484848, "kl": 0.07904052734375, "learning_rate": 4.5e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.024240538477897644, "mask/share_reasoning": 0.778479814529419, "mask/share_step_conf": 0.1894671469926834, "num_tokens": 11882329.0, "reward": 0.6734088659286499, "reward_std": 0.19206441938877106, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7334707379341125, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2953784167766571, "step": 39 }, { "adv/mean_abs_final_conf": 0.7486369609832764, "adv/mean_abs_reasoning": 0.34529808163642883, "adv/mean_abs_step_conf": 0.7777887582778931, "adv/ratio_final_to_reasoning": 2.1680889665976513, "adv/ratio_step_to_reasoning": 2.2525139861531063, "adv/std_final_conf": 0.930046021938324, "adv/std_reasoning": 0.6402245163917542, "adv/std_step_conf": 0.9359531402587891, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.23046875, "calib/ece": 0.08637795275590554, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.028257229832572506, "calib/mean_conf": 0.6534645669291338, "calib/mu_c": 0.6654794520547946, "calib/mu_w": 0.6372222222222221, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08251968503937011, "calib/std_conf": 0.06059908204335681, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5963791423001951, "calib/step_q_c_n": 2052.0, "calib/step_q_gap": 0.010341689116674546, "calib/step_q_w": 0.5860374531835205, "calib/step_q_w_n": 1335.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 745.60546875, "completions/mean_terminated_length": 754.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.042666666666666665, "grad_norm": 0.7285353541374207, "kl": 0.07080078125, "learning_rate": 4.472222222222223e-06, "loss": -0.071, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.023201102390885353, "mask/share_reasoning": 0.775591254234314, "mask/share_step_conf": 0.18948885798454285, "num_tokens": 12179964.0, "reward": 0.6214379072189331, "reward_std": 0.20891620218753815, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.753614068031311, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.17676173150539398, "step": 40 }, { "adv/mean_abs_final_conf": 0.7398024797439575, "adv/mean_abs_reasoning": 0.25526443123817444, "adv/mean_abs_step_conf": 0.7453057765960693, "adv/ratio_final_to_reasoning": 2.898180824314238, "adv/ratio_step_to_reasoning": 2.9197400240249762, "adv/std_final_conf": 0.9244346618652344, "adv/std_reasoning": 0.548233687877655, "adv/std_step_conf": 0.9355214834213257, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.59765625, "calib/ece": 0.21854901960784326, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.029637999122422154, "calib/mean_conf": 0.6309411764705882, "calib/mu_c": 0.625943396226415, "calib/mu_w": 0.6555813953488372, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009058823529411762, "calib/std_conf": 0.047994040383587296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.56872424375274, "calib/step_q_c_n": 2281.0, "calib/step_q_gap": -0.03498215159609719, "calib/step_q_w": 0.6037063953488372, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 646.95703125, "completions/mean_terminated_length": 652.0512084960938, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.04373333333333333, "grad_norm": 0.22850604355335236, "kl": 0.07393646240234375, "learning_rate": 4.444444444444444e-06, "loss": -0.015, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.026456307619810104, "mask/share_reasoning": 0.7719156742095947, "mask/share_step_conf": 0.19381554424762726, "num_tokens": 12452833.0, "reward": 0.8365099430084229, "reward_std": 0.1689138412475586, "rewards/accuracy_reward_step": 0.828125, "rewards/final_brier_reward_step": 0.8058613538742065, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.5023148059844971, "step": 41 }, { "adv/mean_abs_final_conf": 0.6996697187423706, "adv/mean_abs_reasoning": 0.18548455834388733, "adv/mean_abs_step_conf": 0.740502119064331, "adv/ratio_final_to_reasoning": 3.7721184177778664, "adv/ratio_step_to_reasoning": 3.992257499362531, "adv/std_final_conf": 0.9113990068435669, "adv/std_reasoning": 0.49572163820266724, "adv/std_step_conf": 0.9348853826522827, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.6875, "calib/ece": 0.10398437499999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0032569470879328932, "calib/mean_conf": 0.626015625, "calib/mu_c": 0.6269189189189188, "calib/mu_w": 0.6236619718309859, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003671875000000004, "calib/std_conf": 0.04343349520657273, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.568086513994911, "calib/step_q_c_n": 1965.0, "calib/step_q_gap": 5.797962396414391e-05, "calib/step_q_w": 0.5680285343709468, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 572.265625, "completions/mean_terminated_length": 576.7716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.0448, "grad_norm": 0.22524091601371765, "kl": 0.0933380126953125, "learning_rate": 4.416666666666667e-06, "loss": 0.0403, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.026845600455999374, "mask/share_reasoning": 0.76666259765625, "mask/share_step_conf": 0.19867932796478271, "num_tokens": 12703701.0, "reward": 0.7134827375411987, "reward_std": 0.12991458177566528, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7896554470062256, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.29277873039245605, "step": 42 }, { "adv/mean_abs_final_conf": 0.754668116569519, "adv/mean_abs_reasoning": 0.4721015691757202, "adv/mean_abs_step_conf": 0.7553716897964478, "adv/ratio_final_to_reasoning": 1.598529142546919, "adv/ratio_step_to_reasoning": 1.6000194430942296, "adv/std_final_conf": 0.9316073656082153, "adv/std_reasoning": 0.7391892075538635, "adv/std_step_conf": 0.935806930065155, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.58984375, "calib/ece": 0.10503906249999993, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.008032736962314546, "calib/mean_conf": 0.6443359375, "calib/mu_c": 0.642108108108108, "calib/mu_w": 0.6501408450704226, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013359374999999968, "calib/std_conf": 0.054489359245600374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5799176062445793, "calib/step_q_c_n": 2306.0, "calib/step_q_gap": -0.009733429742334598, "calib/step_q_w": 0.5896510359869139, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 697.98046875, "completions/mean_terminated_length": 703.4763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 227.0, "epoch": 0.04586666666666667, "grad_norm": 0.625836193561554, "kl": 0.0879974365234375, "learning_rate": 4.388888888888889e-06, "loss": -0.007, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.024772757664322853, "mask/share_reasoning": 0.7765905857086182, "mask/share_step_conf": 0.1908242106437683, "num_tokens": 12987608.0, "reward": 0.7277460098266602, "reward_std": 0.20869551599025726, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7872527241706848, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.32370805740356445, "step": 43 }, { "adv/mean_abs_final_conf": 0.7820515036582947, "adv/mean_abs_reasoning": 0.369723379611969, "adv/mean_abs_step_conf": 0.770028293132782, "adv/ratio_final_to_reasoning": 2.1152341095634015, "adv/ratio_step_to_reasoning": 2.0827146336835387, "adv/std_final_conf": 0.929624080657959, "adv/std_reasoning": 0.6402297019958496, "adv/std_step_conf": 0.9356967210769653, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.5078125, "calib/ece": 0.09227450980392164, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.006852781815778863, "calib/mean_conf": 0.6474117647058825, "calib/mu_c": 0.6499378881987576, "calib/mu_w": 0.6430851063829788, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.054156862745098115, "calib/std_conf": 0.056564790708233025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5848204623708805, "calib/step_q_c_n": 2033.0, "calib/step_q_gap": 0.009747750651462161, "calib/step_q_w": 0.5750727117194183, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 721.84765625, "completions/mean_terminated_length": 727.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.046933333333333334, "grad_norm": 0.23728294670581818, "kl": 0.076202392578125, "learning_rate": 4.361111111111112e-06, "loss": 0.0242, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02301742322742939, "mask/share_reasoning": 0.7762603759765625, "mask/share_step_conf": 0.19290973246097565, "num_tokens": 13278721.0, "reward": 0.6986934542655945, "reward_std": 0.18942584097385406, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7639957070350647, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.308391273021698, "step": 44 }, { "adv/mean_abs_final_conf": 0.7148399353027344, "adv/mean_abs_reasoning": 0.35291802883148193, "adv/mean_abs_step_conf": 0.7707694172859192, "adv/ratio_final_to_reasoning": 2.0255126598932405, "adv/ratio_step_to_reasoning": 2.183989919239748, "adv/std_final_conf": 0.9121301770210266, "adv/std_reasoning": 0.6401578187942505, "adv/std_step_conf": 0.9362253546714783, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.640625, "calib/ece": 0.05878906250000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.025874628165130864, "calib/mean_conf": 0.6404296875, "calib/mu_c": 0.6482122905027934, "calib/mu_w": 0.6223376623376625, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.05550382638748741, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5797810898928738, "calib/step_q_c_n": 2147.0, "calib/step_q_gap": 0.01256620393849206, "calib/step_q_w": 0.5672148859543817, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 679.6484375, "completions/mean_terminated_length": 685.0, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.048, "grad_norm": 8878.478515625, "kl": 136.07752990722656, "learning_rate": 4.333333333333334e-06, "loss": 1.7789, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.024658601731061935, "mask/share_reasoning": 0.7740293741226196, "mask/share_step_conf": 0.19349952042102814, "num_tokens": 13557759.0, "reward": 0.6431659460067749, "reward_std": 0.2364698052406311, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7940347194671631, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.1524534672498703, "step": 45 }, { "adv/mean_abs_final_conf": 0.7531983256340027, "adv/mean_abs_reasoning": 0.4143233895301819, "adv/mean_abs_step_conf": 0.7525116205215454, "adv/ratio_final_to_reasoning": 1.817899603708313, "adv/ratio_step_to_reasoning": 1.8162421903693367, "adv/std_final_conf": 0.927490234375, "adv/std_reasoning": 0.6816543936729431, "adv/std_step_conf": 0.9355647563934326, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 12.65625, "calib/ece": 0.08423886639676122, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.012145748987854251, "calib/gap": -0.028574203914694674, "calib/mean_conf": 0.6410242914979758, "calib/mu_c": 0.6313067484662577, "calib/mu_w": 0.6598809523809523, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03267206477732801, "calib/std_conf": 0.08169286447591004, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5812162309368192, "calib/step_q_c_n": 1836.0, "calib/step_q_gap": -0.04324245852187025, "calib/step_q_w": 0.6244586894586894, "calib/step_q_w_n": 1404.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 683.46875, "completions/mean_terminated_length": 708.3724975585938, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.04906666666666667, "grad_norm": 0.28518831729888916, "kl": 0.08718109130859375, "learning_rate": 4.305555555555556e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.025916893035173416, "mask/share_reasoning": 0.7426183819770813, "mask/share_step_conf": 0.1963084638118744, "num_tokens": 13837495.0, "reward": 0.6834006309509277, "reward_std": 0.19034671783447266, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.729149580001831, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3173391819000244, "step": 46 }, { "adv/mean_abs_final_conf": 0.7467596530914307, "adv/mean_abs_reasoning": 0.1418902724981308, "adv/mean_abs_step_conf": 0.7821815609931946, "adv/ratio_final_to_reasoning": 5.262937620345102, "adv/ratio_step_to_reasoning": 5.512580582319332, "adv/std_final_conf": 0.9253977537155151, "adv/std_reasoning": 0.4049662947654724, "adv/std_step_conf": 0.9351449608802795, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.69921875, "calib/ece": 0.18429133858267724, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.039278743855015064, "calib/mean_conf": 0.6492519685039368, "calib/mu_c": 0.6373446327683615, "calib/mu_w": 0.6766233766233766, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06834645669291337, "calib/std_conf": 0.05385010816765645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5702151639344262, "calib/step_q_c_n": 1952.0, "calib/step_q_gap": -0.03858391227804503, "calib/step_q_w": 0.6087990762124712, "calib/step_q_w_n": 1299.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 742.6328125, "completions/mean_terminated_length": 751.4387817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.050133333333333335, "grad_norm": 0.12521328032016754, "kl": 0.075958251953125, "learning_rate": 4.277777777777778e-06, "loss": -0.0114, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021606799215078354, "mask/share_reasoning": 0.7760756015777588, "mask/share_step_conf": 0.19059887528419495, "num_tokens": 14133585.0, "reward": 0.6950026154518127, "reward_std": 0.12959975004196167, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7609972357749939, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.29228928685188293, "step": 47 }, { "adv/mean_abs_final_conf": 0.7752068042755127, "adv/mean_abs_reasoning": 0.34300339221954346, "adv/mean_abs_step_conf": 0.7662767171859741, "adv/ratio_final_to_reasoning": 2.260055794956489, "adv/ratio_step_to_reasoning": 2.234020813110529, "adv/std_final_conf": 0.929909348487854, "adv/std_reasoning": 0.618562638759613, "adv/std_step_conf": 0.9360288381576538, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.26953125, "calib/ece": 0.11050980392156864, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.017300155082475843, "calib/mean_conf": 0.631921568627451, "calib/mu_c": 0.6263583815028901, "calib/mu_w": 0.643658536585366, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.032, "calib/std_conf": 0.05036176698657422, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5689640591966173, "calib/step_q_c_n": 1892.0, "calib/step_q_gap": -0.01274792469059316, "calib/step_q_w": 0.5817119838872105, "calib/step_q_w_n": 993.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 650.8046875, "completions/mean_terminated_length": 655.9291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.0512, "grad_norm": 0.2294154167175293, "kl": 0.092254638671875, "learning_rate": 4.25e-06, "loss": -0.0293, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.026431187987327576, "mask/share_reasoning": 0.7731444239616394, "mask/share_step_conf": 0.19261188805103302, "num_tokens": 14403879.0, "reward": 0.7620805501937866, "reward_std": 0.21728140115737915, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7665836215019226, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4232025742530823, "step": 48 }, { "adv/mean_abs_final_conf": 0.7520368099212646, "adv/mean_abs_reasoning": 0.3485822379589081, "adv/mean_abs_step_conf": 0.752032995223999, "adv/ratio_final_to_reasoning": 2.1574157487907257, "adv/ratio_step_to_reasoning": 2.1574048053264576, "adv/std_final_conf": 0.9283875823020935, "adv/std_reasoning": 0.6185300946235657, "adv/std_step_conf": 0.9361112713813782, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 10.546875, "calib/ece": 0.09782608695652174, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0036701964395333686, "calib/mean_conf": 0.6333201581027669, "calib/mu_c": 0.6343646408839778, "calib/mu_w": 0.6306944444444444, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007865612648221346, "calib/std_conf": 0.041082730914846016, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5679769994772609, "calib/step_q_c_n": 1913.0, "calib/step_q_gap": -0.0016163931529805664, "calib/step_q_w": 0.5695933926302414, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 644.6171875, "completions/mean_terminated_length": 654.8492431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 261.0, "epoch": 0.05226666666666667, "grad_norm": 0.2470749318599701, "kl": 0.08589935302734375, "learning_rate": 4.222222222222223e-06, "loss": 0.0081, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.024077853187918663, "mask/share_reasoning": 0.7702655792236328, "mask/share_step_conf": 0.19003157317638397, "num_tokens": 14673437.0, "reward": 0.701884388923645, "reward_std": 0.2006235271692276, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7802191376686096, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2844870388507843, "step": 49 }, { "adv/mean_abs_final_conf": 0.7255159616470337, "adv/mean_abs_reasoning": 0.3110960125923157, "adv/mean_abs_step_conf": 0.7348172068595886, "adv/ratio_final_to_reasoning": 2.3321287714407513, "adv/ratio_step_to_reasoning": 2.36202708204605, "adv/std_final_conf": 0.9274781346321106, "adv/std_reasoning": 0.6185328960418701, "adv/std_step_conf": 0.9359754920005798, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.80859375, "calib/ece": 0.12241106719367606, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": 0.005098189890710647, "calib/mean_conf": 0.6428853754940712, "calib/mu_c": 0.6441145833333334, "calib/mu_w": 0.6390163934426227, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003201581027667973, "calib/std_conf": 0.06874923430085506, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5843965517241381, "calib/step_q_c_n": 2204.0, "calib/step_q_gap": -0.0059819586543722325, "calib/step_q_w": 0.5903785103785103, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 699.2890625, "completions/mean_terminated_length": 704.7952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.05333333333333334, "grad_norm": 0.2760617136955261, "kl": 0.08426666259765625, "learning_rate": 4.194444444444445e-06, "loss": -0.0539, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02490140125155449, "mask/share_reasoning": 0.7720136046409607, "mask/share_step_conf": 0.19527247548103333, "num_tokens": 14957815.0, "reward": 0.7315381169319153, "reward_std": 0.17188885807991028, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7913238406181335, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.324096143245697, "step": 50 }, { "adv/mean_abs_final_conf": 0.6964578628540039, "adv/mean_abs_reasoning": 0.3133673369884491, "adv/mean_abs_step_conf": 0.7605372667312622, "adv/ratio_final_to_reasoning": 2.222496669714099, "adv/ratio_step_to_reasoning": 2.4269832141417345, "adv/std_final_conf": 0.9126064777374268, "adv/std_reasoning": 0.6184675693511963, "adv/std_step_conf": 0.9359240531921387, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.35546875, "calib/ece": 0.18976377952755905, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0368697614442296, "calib/mean_conf": 0.6375590551181102, "calib/mu_c": 0.6279787234042552, "calib/mu_w": 0.6648484848484848, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04358267716535434, "calib/std_conf": 0.055033512130109555, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5662160751565762, "calib/step_q_c_n": 1916.0, "calib/step_q_gap": -0.03752761808257621, "calib/step_q_w": 0.6037436932391524, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 722.0390625, "completions/mean_terminated_length": 724.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.0544, "grad_norm": 0.33014073967933655, "kl": 0.07884979248046875, "learning_rate": 4.166666666666667e-06, "loss": 0.0451, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.023661799728870392, "mask/share_reasoning": 0.7851213216781616, "mask/share_step_conf": 0.18731063604354858, "num_tokens": 15251953.0, "reward": 0.7580503821372986, "reward_std": 0.19549910724163055, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7738453149795532, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.39694297313690186, "step": 51 }, { "adv/mean_abs_final_conf": 0.7437102794647217, "adv/mean_abs_reasoning": 0.19842737913131714, "adv/mean_abs_step_conf": 0.7377064824104309, "adv/ratio_final_to_reasoning": 3.74802248923795, "adv/ratio_step_to_reasoning": 3.7177655908171046, "adv/std_final_conf": 0.927570104598999, "adv/std_reasoning": 0.4959178566932678, "adv/std_step_conf": 0.936040997505188, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.62890625, "calib/ece": 0.17385826771653545, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.023765092243794084, "calib/mean_conf": 0.6466929133858267, "calib/mu_c": 0.6419211822660099, "calib/mu_w": 0.665686274509804, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010669291338582677, "calib/std_conf": 0.06702984548692002, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5801745747538049, "calib/step_q_c_n": 2234.0, "calib/step_q_gap": -0.019771589445387594, "calib/step_q_w": 0.5999461641991924, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 739.28515625, "completions/mean_terminated_length": 745.1063232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.055466666666666664, "grad_norm": 0.4081308841705322, "kl": 0.08150482177734375, "learning_rate": 4.138888888888889e-06, "loss": -0.0239, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.024338986724615097, "mask/share_reasoning": 0.7771621346473694, "mask/share_step_conf": 0.1906864047050476, "num_tokens": 15549162.0, "reward": 0.7791553139686584, "reward_std": 0.1678694188594818, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.797863245010376, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.4034160375595093, "step": 52 }, { "adv/mean_abs_final_conf": 0.7358145117759705, "adv/mean_abs_reasoning": 0.34003329277038574, "adv/mean_abs_step_conf": 0.7458301782608032, "adv/ratio_final_to_reasoning": 2.16394843511057, "adv/ratio_step_to_reasoning": 2.1934033934860606, "adv/std_final_conf": 0.9298948049545288, "adv/std_reasoning": 0.6401073932647705, "adv/std_step_conf": 0.9360513687133789, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.49609375, "calib/ece": 0.14941406250000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004433285509325646, "calib/mean_conf": 0.6552734375, "calib/mu_c": 0.654390243902439, "calib/mu_w": 0.6588235294117647, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.001953125000000001, "calib/std_conf": 0.05525441368735708, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.58257016840417, "calib/step_q_c_n": 2494.0, "calib/step_q_gap": -0.016536214574553454, "calib/step_q_w": 0.5991063829787234, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 771.90234375, "completions/mean_terminated_length": 777.9802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.05653333333333333, "grad_norm": 0.21992897987365723, "kl": 0.0755157470703125, "learning_rate": 4.111111111111111e-06, "loss": -0.0655, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020725980401039124, "mask/share_reasoning": 0.7839224338531494, "mask/share_step_conf": 0.18753905594348907, "num_tokens": 15852593.0, "reward": 0.7674970626831055, "reward_std": 0.20992466807365417, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8148292899131775, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.3600085973739624, "step": 53 }, { "adv/mean_abs_final_conf": 0.7923758029937744, "adv/mean_abs_reasoning": 0.2119789868593216, "adv/mean_abs_step_conf": 0.7696312665939331, "adv/ratio_final_to_reasoning": 3.737992216745659, "adv/ratio_step_to_reasoning": 3.6306960326435265, "adv/std_final_conf": 0.9255867004394531, "adv/std_reasoning": 0.467656672000885, "adv/std_step_conf": 0.9360727667808533, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.70703125, "calib/ece": 0.21914453125000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.013305408015451659, "calib/mean_conf": 0.6339882812500001, "calib/mu_c": 0.6359633027522935, "calib/mu_w": 0.6226578947368419, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0007851562499999982, "calib/std_conf": 0.06745568466720131, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.577243031358885, "calib/step_q_c_n": 2296.0, "calib/step_q_gap": -0.016914272011901432, "calib/step_q_w": 0.5941573033707864, "calib/step_q_w_n": 445.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 667.765625, "completions/mean_terminated_length": 673.0236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.0576, "grad_norm": 0.3071500360965729, "kl": 0.1081085205078125, "learning_rate": 4.083333333333334e-06, "loss": 0.0084, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02563382312655449, "mask/share_reasoning": 0.777256965637207, "mask/share_step_conf": 0.189296692609787, "num_tokens": 16129773.0, "reward": 0.8549725413322449, "reward_std": 0.18176884949207306, "rewards/accuracy_reward_step": 0.8515625, "rewards/final_brier_reward_step": 0.8250710964202881, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.5145614743232727, "step": 54 }, { "adv/mean_abs_final_conf": 0.7537381649017334, "adv/mean_abs_reasoning": 0.3317815065383911, "adv/mean_abs_step_conf": 0.7744541168212891, "adv/ratio_final_to_reasoning": 2.271790772083063, "adv/ratio_step_to_reasoning": 2.3342293092266595, "adv/std_final_conf": 0.9304205775260925, "adv/std_reasoning": 0.6186366081237793, "adv/std_step_conf": 0.9362649321556091, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.35546875, "calib/ece": 0.14776892430278885, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.04838398714687364, "calib/mean_conf": 0.6510358565737052, "calib/mu_c": 0.6323376623376623, "calib/mu_w": 0.680721649484536, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09262948207171313, "calib/std_conf": 0.05757830895843759, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5705675146771038, "calib/step_q_c_n": 1533.0, "calib/step_q_gap": -0.04119935648854045, "calib/step_q_w": 0.6117668711656442, "calib/step_q_w_n": 1630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 745.17578125, "completions/mean_terminated_length": 748.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.058666666666666666, "grad_norm": 1.5653352737426758, "kl": 0.0928497314453125, "learning_rate": 4.055555555555556e-06, "loss": -0.0394, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.022620702162384987, "mask/share_reasoning": 0.7736297845840454, "mask/share_step_conf": 0.19984331727027893, "num_tokens": 16428362.0, "reward": 0.6656664609909058, "reward_std": 0.23249629139900208, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7208675742149353, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.29405906796455383, "step": 55 }, { "adv/mean_abs_final_conf": 0.7665127515792847, "adv/mean_abs_reasoning": 0.39497989416122437, "adv/mean_abs_step_conf": 0.7779443264007568, "adv/ratio_final_to_reasoning": 1.94063739170077, "adv/ratio_step_to_reasoning": 1.9695795606325537, "adv/std_final_conf": 0.9304113984107971, "adv/std_reasoning": 0.6612225770950317, "adv/std_step_conf": 0.9364283084869385, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.171875, "calib/ece": 0.06500000000000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.006969597907813041, "calib/mean_conf": 0.64859375, "calib/mu_c": 0.6511801242236025, "calib/mu_w": 0.6442105263157895, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.042343750000000034, "calib/std_conf": 0.06418034131209883, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5809301014656144, "calib/step_q_c_n": 1774.0, "calib/step_q_gap": 0.0017127902317285448, "calib/step_q_w": 0.5792173112338859, "calib/step_q_w_n": 1086.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 747.08984375, "completions/mean_terminated_length": 752.972412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.05973333333333333, "grad_norm": 0.21732014417648315, "kl": 0.08220672607421875, "learning_rate": 4.027777777777779e-06, "loss": 0.0511, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022660259157419205, "mask/share_reasoning": 0.7831286787986755, "mask/share_step_conf": 0.18639856576919556, "num_tokens": 16726457.0, "reward": 0.707023024559021, "reward_std": 0.22808465361595154, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7653632760047913, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.32290148735046387, "step": 56 }, { "adv/mean_abs_final_conf": 0.7563657760620117, "adv/mean_abs_reasoning": 0.25031182169914246, "adv/mean_abs_step_conf": 0.7859717011451721, "adv/ratio_final_to_reasoning": 3.02169418498784, "adv/ratio_step_to_reasoning": 3.1399703610077827, "adv/std_final_conf": 0.9285401701927185, "adv/std_reasoning": 0.5483488440513611, "adv/std_step_conf": 0.9359719157218933, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 12.4140625, "calib/ece": 0.21637096774193545, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.004032258064516129, "calib/gap": -0.03967926689576173, "calib/mean_conf": 0.6545161290322581, "calib/mu_c": 0.6458762886597939, "calib/mu_w": 0.6855555555555556, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04431451612903227, "calib/std_conf": 0.05451923068099795, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5745685997171146, "calib/step_q_c_n": 2121.0, "calib/step_q_gap": -0.05970765383066223, "calib/step_q_w": 0.6342762535477768, "calib/step_q_w_n": 1057.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 727.50390625, "completions/mean_terminated_length": 741.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.0608, "grad_norm": 0.1882893294095993, "kl": 0.0820465087890625, "learning_rate": 4.000000000000001e-06, "loss": -0.0913, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.021145198494195938, "mask/share_reasoning": 0.762434720993042, "mask/share_step_conf": 0.19688883423805237, "num_tokens": 17019490.0, "reward": 0.7441555261611938, "reward_std": 0.19160419702529907, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7693679332733154, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.37441182136535645, "step": 57 }, { "adv/mean_abs_final_conf": 0.7255205512046814, "adv/mean_abs_reasoning": 0.3990098834037781, "adv/mean_abs_step_conf": 0.7807788252830505, "adv/ratio_final_to_reasoning": 1.8183022059894463, "adv/ratio_step_to_reasoning": 1.9567906905527483, "adv/std_final_conf": 0.9329466819763184, "adv/std_reasoning": 0.7013241052627563, "adv/std_step_conf": 0.9364590048789978, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.99609375, "calib/ece": 0.0705179282868526, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0199203187250996, "calib/gap": 0.015800388403384713, "calib/mean_conf": 0.6770517928286852, "calib/mu_c": 0.6826543209876543, "calib/mu_w": 0.6668539325842696, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.051075697211155374, "calib/std_conf": 0.07998053365487154, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6020530726256983, "calib/step_q_c_n": 2148.0, "calib/step_q_gap": -0.02250441866350028, "calib/step_q_w": 0.6245574912891986, "calib/step_q_w_n": 1435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 863.31640625, "completions/mean_terminated_length": 877.0198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.06186666666666667, "grad_norm": 0.19960562884807587, "kl": 0.069580078125, "learning_rate": 3.972222222222223e-06, "loss": -0.0834, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019993755966424942, "mask/share_reasoning": 0.7693009972572327, "mask/share_step_conf": 0.1950802505016327, "num_tokens": 17346819.0, "reward": 0.6891767978668213, "reward_std": 0.24477124214172363, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7559226751327515, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.299774706363678, "step": 58 }, { "adv/mean_abs_final_conf": 0.753639817237854, "adv/mean_abs_reasoning": 0.40339523553848267, "adv/mean_abs_step_conf": 0.7527267932891846, "adv/ratio_final_to_reasoning": 1.8682417412090606, "adv/ratio_step_to_reasoning": 1.8659783928394384, "adv/std_final_conf": 0.9298037886619568, "adv/std_reasoning": 0.6815541982650757, "adv/std_step_conf": 0.9364940524101257, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.56640625, "calib/ece": 0.04047244094488186, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007023617820719141, "calib/mean_conf": 0.6483464566929135, "calib/mu_c": 0.6458024691358025, "calib/mu_w": 0.6528260869565217, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025511811023622, "calib/std_conf": 0.05279232495409867, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5748107798165137, "calib/step_q_c_n": 1744.0, "calib/step_q_gap": -0.01329111007666628, "calib/step_q_w": 0.58810188989318, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 761.578125, "completions/mean_terminated_length": 767.5748291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.06293333333333333, "grad_norm": 0.5024107098579407, "kl": 0.0820159912109375, "learning_rate": 3.944444444444445e-06, "loss": 0.0103, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02243354171514511, "mask/share_reasoning": 0.7792695760726929, "mask/share_step_conf": 0.1904844045639038, "num_tokens": 17648031.0, "reward": 0.6598141193389893, "reward_std": 0.2476826310157776, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7568843960762024, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.2377437800168991, "step": 59 }, { "adv/mean_abs_final_conf": 0.7591273188591003, "adv/mean_abs_reasoning": 0.5218755006790161, "adv/mean_abs_step_conf": 0.775875449180603, "adv/ratio_final_to_reasoning": 1.4546138262313408, "adv/ratio_step_to_reasoning": 1.4867060212083258, "adv/std_final_conf": 0.9318666458129883, "adv/std_reasoning": 0.7754560112953186, "adv/std_step_conf": 0.9364181756973267, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 11.859375, "calib/ece": 0.09024896265560166, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.004149377593360996, "calib/gap": -0.018300289766661537, "calib/mean_conf": 0.6528215767634854, "calib/mu_c": 0.6465189873417722, "calib/mu_w": 0.6648192771084337, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.043734439834024905, "calib/std_conf": 0.05413527423211244, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.57798463356974, "calib/step_q_c_n": 1692.0, "calib/step_q_gap": -0.02137548547787904, "calib/step_q_w": 0.599360119047619, "calib/step_q_w_n": 1344.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 780.3125, "completions/mean_terminated_length": 792.698486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 318.0, "epoch": 0.064, "grad_norm": 0.47881805896759033, "kl": 0.07741546630859375, "learning_rate": 3.916666666666667e-06, "loss": -0.1225, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.020657043904066086, "mask/share_reasoning": 0.7711943984031677, "mask/share_step_conf": 0.1925235241651535, "num_tokens": 17956647.0, "reward": 0.619196891784668, "reward_std": 0.2881484627723694, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7128839492797852, "rewards/format_reward_step": 0.93359375, "rewards/step_correlation_reward": 0.21379104256629944, "step": 60 }, { "adv/mean_abs_final_conf": 0.7319515347480774, "adv/mean_abs_reasoning": 0.2323523759841919, "adv/mean_abs_step_conf": 0.759729266166687, "adv/ratio_final_to_reasoning": 3.1501788249321616, "adv/ratio_step_to_reasoning": 3.2697288458904126, "adv/std_final_conf": 0.9271352291107178, "adv/std_reasoning": 0.548259437084198, "adv/std_step_conf": 0.9357583522796631, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 10.5859375, "calib/ece": 0.14762096774193548, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01094119893088974, "calib/mean_conf": 0.6399596774193548, "calib/mu_c": 0.6375773195876289, "calib/mu_w": 0.6485185185185186, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.002661290322580646, "calib/std_conf": 0.05387783361211411, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5723488943488944, "calib/step_q_c_n": 2035.0, "calib/step_q_gap": -0.018080735280735283, "calib/step_q_w": 0.5904296296296296, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 677.70703125, "completions/mean_terminated_length": 685.7431030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.06506666666666666, "grad_norm": 0.6927701234817505, "kl": 0.0820159912109375, "learning_rate": 3.88888888888889e-06, "loss": -0.0894, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.024425944313406944, "mask/share_reasoning": 0.7670689821243286, "mask/share_step_conf": 0.1967863142490387, "num_tokens": 18234204.0, "reward": 0.7487568259239197, "reward_std": 0.17801451683044434, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.7777035236358643, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.3713727295398712, "step": 61 }, { "adv/mean_abs_final_conf": 0.5855187773704529, "adv/mean_abs_reasoning": 0.6621418595314026, "adv/mean_abs_step_conf": 0.5837773680686951, "adv/ratio_final_to_reasoning": 0.8842799604677224, "adv/ratio_step_to_reasoning": 0.8816499963947816, "adv/std_final_conf": 0.8104560971260071, "adv/std_reasoning": 0.8591093420982361, "adv/std_step_conf": 0.8111568093299866, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 11.80859375, "calib/ece": 0.07259259259259261, "calib/final_conf_rate": 0.52734375, "calib/format_rate": 0.52734375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.011274001037882786, "calib/mean_conf": 0.6237037037037039, "calib/mu_c": 0.6271276595744681, "calib/mu_w": 0.6158536585365854, "calib/nonempty_final_conf_rate": 0.52734375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.039266246757625405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5861579892280072, "calib/step_q_c_n": 2228.0, "calib/step_q_gap": -0.006156476180797843, "calib/step_q_w": 0.5923144654088051, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 800.28515625, "completions/mean_terminated_length": 806.5866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.06613333333333334, "grad_norm": 10.070162773132324, "kl": 0.0822296142578125, "learning_rate": 3.861111111111112e-06, "loss": -0.4774, "mask/has_final_conf_rate": 0.52734375, "mask/share_final_conf": 0.014446118846535683, "mask/share_reasoning": 0.7766348123550415, "mask/share_step_conf": 0.20110660791397095, "num_tokens": 18546157.0, "reward": 0.4277394413948059, "reward_std": 0.26725536584854126, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.41474997997283936, "rewards/format_reward_step": 0.52734375, "rewards/step_correlation_reward": 0.18838509917259216, "step": 62 }, { "adv/mean_abs_final_conf": 0.7331758737564087, "adv/mean_abs_reasoning": 0.4314548075199127, "adv/mean_abs_step_conf": 0.7121888399124146, "adv/ratio_final_to_reasoning": 1.699310938197324, "adv/ratio_step_to_reasoning": 1.6506684535657776, "adv/std_final_conf": 0.9166691303253174, "adv/std_reasoning": 0.7014430165290833, "adv/std_step_conf": 0.9218115210533142, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.625, "calib/ece": 0.08731818181818182, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.85546875, "calib/frac_conf_gt_0.9": 0.004545454545454545, "calib/gap": -0.014696969696969653, "calib/mean_conf": 0.6498636363636363, "calib/mu_c": 0.6454545454545455, "calib/mu_w": 0.6601515151515152, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.018590909090909064, "calib/std_conf": 0.05784367291128537, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5940733197556007, "calib/step_q_c_n": 1964.0, "calib/step_q_gap": -0.0131191092664813, "calib/step_q_w": 0.607192429022082, "calib/step_q_w_n": 1268.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 888.6328125, "completions/mean_terminated_length": 888.6328125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.0672, "grad_norm": 0.47227707505226135, "kl": 0.07555389404296875, "learning_rate": 3.833333333333334e-06, "loss": -0.0276, "mask/has_final_conf_rate": 0.859375, "mask/share_final_conf": 0.01991172507405281, "mask/share_reasoning": 0.7896748781204224, "mask/share_step_conf": 0.19041332602500916, "num_tokens": 18882287.0, "reward": 0.6191070675849915, "reward_std": 0.24419625103473663, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6652539372444153, "rewards/format_reward_step": 0.85546875, "rewards/step_correlation_reward": 0.26905399560928345, "step": 63 }, { "adv/mean_abs_final_conf": 0.7193543314933777, "adv/mean_abs_reasoning": 0.2724745273590088, "adv/mean_abs_step_conf": 0.7900326251983643, "adv/ratio_final_to_reasoning": 2.640079197368663, "adv/ratio_step_to_reasoning": 2.899473330060787, "adv/std_final_conf": 0.914580225944519, "adv/std_reasoning": 0.5726332068443298, "adv/std_step_conf": 0.9362464547157288, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.85546875, "calib/ece": 0.1372047244094489, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": 0.0004552708157326002, "calib/mean_conf": 0.6541338582677164, "calib/mu_c": 0.654228855721393, "calib/mu_w": 0.6537735849056604, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.06036501335931002, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5806550618415026, "calib/step_q_c_n": 2183.0, "calib/step_q_gap": -0.03269000858103266, "calib/step_q_w": 0.6133450704225353, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 726.25390625, "completions/mean_terminated_length": 734.8656616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.06826666666666667, "grad_norm": 0.6560176014900208, "kl": 0.08819580078125, "learning_rate": 3.8055555555555556e-06, "loss": 0.0339, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.022718653082847595, "mask/share_reasoning": 0.7660689353942871, "mask/share_step_conf": 0.1994936764240265, "num_tokens": 19171984.0, "reward": 0.7354456186294556, "reward_std": 0.21538183093070984, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.8062112927436829, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3092111051082611, "step": 64 }, { "adv/mean_abs_final_conf": 0.7510883808135986, "adv/mean_abs_reasoning": 0.25280725955963135, "adv/mean_abs_step_conf": 0.7716500163078308, "adv/ratio_final_to_reasoning": 2.9709921389201024, "adv/ratio_step_to_reasoning": 3.052325386747118, "adv/std_final_conf": 0.9271266460418701, "adv/std_reasoning": 0.5481659770011902, "adv/std_step_conf": 0.9363374710083008, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.94921875, "calib/ece": 0.04152941176470584, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.010342105263158041, "calib/mean_conf": 0.6436470588235293, "calib/mu_c": 0.6475, "calib/mu_w": 0.6371578947368419, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028862745098039176, "calib/std_conf": 0.05433766018958576, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5741564506713368, "calib/step_q_c_n": 1713.0, "calib/step_q_gap": -0.01048575116352557, "calib/step_q_w": 0.5846422018348624, "calib/step_q_w_n": 1090.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 679.70703125, "completions/mean_terminated_length": 685.05908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.06933333333333333, "grad_norm": 0.21009093523025513, "kl": 0.0871734619140625, "learning_rate": 3.777777777777778e-06, "loss": -0.01, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.024044401943683624, "mask/share_reasoning": 0.7692995071411133, "mask/share_step_conf": 0.19884361326694489, "num_tokens": 19451013.0, "reward": 0.6475369334220886, "reward_std": 0.20800790190696716, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.764864444732666, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.20599065721035004, "step": 65 }, { "adv/mean_abs_final_conf": 0.7202534079551697, "adv/mean_abs_reasoning": 0.4160011112689972, "adv/mean_abs_step_conf": 0.7719686627388, "adv/ratio_final_to_reasoning": 1.7313737594547793, "adv/ratio_step_to_reasoning": 1.855688943675982, "adv/std_final_conf": 0.9306468367576599, "adv/std_reasoning": 0.7012670636177063, "adv/std_step_conf": 0.936653196811676, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.515625, "calib/ece": 0.18390438247011956, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.00398406374501992, "calib/gap": 0.008322925958965444, "calib/mean_conf": 0.6666135458167332, "calib/mu_c": 0.6705263157894737, "calib/mu_w": 0.6622033898305083, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16031872509960163, "calib/std_conf": 0.06852520151598002, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6015303119482048, "calib/step_q_c_n": 1699.0, "calib/step_q_gap": 0.006436615185002048, "calib/step_q_w": 0.5950936967632028, "calib/step_q_w_n": 1761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 812.390625, "completions/mean_terminated_length": 825.2857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0704, "grad_norm": 0.2491704523563385, "kl": 0.0764312744140625, "learning_rate": 3.7500000000000005e-06, "loss": -0.0821, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020575175061821938, "mask/share_reasoning": 0.7718431353569031, "mask/share_step_conf": 0.19195665419101715, "num_tokens": 19765337.0, "reward": 0.5751186609268188, "reward_std": 0.24441185593605042, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7152577638626099, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.13576076924800873, "step": 66 }, { "adv/mean_abs_final_conf": 0.7632363438606262, "adv/mean_abs_reasoning": 0.31024909019470215, "adv/mean_abs_step_conf": 0.792525053024292, "adv/ratio_final_to_reasoning": 2.4600760098334025, "adv/ratio_step_to_reasoning": 2.5544798617360303, "adv/std_final_conf": 0.9284982085227966, "adv/std_reasoning": 0.5960694551467896, "adv/std_step_conf": 0.9355476498603821, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.21484375, "calib/ece": 0.13881422924901188, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0411336705202312, "calib/mean_conf": 0.6584980237154151, "calib/mu_c": 0.6454913294797688, "calib/mu_w": 0.686625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0567588932806324, "calib/std_conf": 0.06758479519499076, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5716897506925208, "calib/step_q_c_n": 1805.0, "calib/step_q_gap": -0.03887757154651106, "calib/step_q_w": 0.6105673222390319, "calib/step_q_w_n": 1322.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 775.9296875, "completions/mean_terminated_length": 785.1304931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.07146666666666666, "grad_norm": 1.5712337493896484, "kl": 0.0920257568359375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0613, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.022031016647815704, "mask/share_reasoning": 0.7744245529174805, "mask/share_step_conf": 0.1918257176876068, "num_tokens": 20068983.0, "reward": 0.7031233310699463, "reward_std": 0.20975123345851898, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7481851577758789, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3268115818500519, "step": 67 }, { "adv/mean_abs_final_conf": 0.7644180059432983, "adv/mean_abs_reasoning": 0.3880755305290222, "adv/mean_abs_step_conf": 0.7951478958129883, "adv/ratio_final_to_reasoning": 1.9697660527610394, "adv/ratio_step_to_reasoning": 2.048951385131259, "adv/std_final_conf": 0.9311574697494507, "adv/std_reasoning": 0.6612011790275574, "adv/std_step_conf": 0.9362030029296875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.8125, "calib/ece": 0.12948616600790516, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": -0.048499346405228905, "calib/mean_conf": 0.6574703557312254, "calib/mu_c": 0.6383006535947712, "calib/mu_w": 0.6868000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0911067193675889, "calib/std_conf": 0.06495303951749451, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5713325330132053, "calib/step_q_c_n": 1666.0, "calib/step_q_gap": -0.04335767764354803, "calib/step_q_w": 0.6146902106567533, "calib/step_q_w_n": 1614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 727.9375, "completions/mean_terminated_length": 739.4921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.07253333333333334, "grad_norm": 0.2425469011068344, "kl": 0.08228302001953125, "learning_rate": 3.694444444444445e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.022897986695170403, "mask/share_reasoning": 0.7602324485778809, "mask/share_step_conf": 0.2012445628643036, "num_tokens": 20359423.0, "reward": 0.6331701874732971, "reward_std": 0.21751436591148376, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.722222626209259, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.22693029046058655, "step": 68 }, { "adv/mean_abs_final_conf": 0.7844705581665039, "adv/mean_abs_reasoning": 0.5006694793701172, "adv/mean_abs_step_conf": 0.7850892543792725, "adv/ratio_final_to_reasoning": 1.5668431779652945, "adv/ratio_step_to_reasoning": 1.5680789157888722, "adv/std_final_conf": 0.9311051368713379, "adv/std_reasoning": 0.7394245862960815, "adv/std_step_conf": 0.9362419247627258, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.64453125, "calib/ece": 0.11515999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.027525703463203488, "calib/mean_conf": 0.66044, "calib/mu_c": 0.6498701298701298, "calib/mu_w": 0.6773958333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07979999999999998, "calib/std_conf": 0.06698810640703319, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5879117496151873, "calib/step_q_c_n": 1949.0, "calib/step_q_gap": -0.02638876851952776, "calib/step_q_w": 0.6143005181347151, "calib/step_q_w_n": 1544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 813.2265625, "completions/mean_terminated_length": 829.4263305664062, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.0736, "grad_norm": 1.063038945198059, "kl": 0.075531005859375, "learning_rate": 3.6666666666666666e-06, "loss": -0.1206, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019905662164092064, "mask/share_reasoning": 0.7785724401473999, "mask/share_step_conf": 0.1819905936717987, "num_tokens": 20672105.0, "reward": 0.6814106702804565, "reward_std": 0.23180070519447327, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7257925868034363, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3221849501132965, "step": 69 }, { "adv/mean_abs_final_conf": 0.7228235006332397, "adv/mean_abs_reasoning": 0.33797919750213623, "adv/mean_abs_step_conf": 0.7491491436958313, "adv/ratio_final_to_reasoning": 2.1386626927791053, "adv/ratio_step_to_reasoning": 2.2165540046028904, "adv/std_final_conf": 0.9298471212387085, "adv/std_reasoning": 0.661091148853302, "adv/std_step_conf": 0.9359894394874573, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 12.90625, "calib/ece": 0.08349397590361451, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.03341465062846338, "calib/mean_conf": 0.6573895582329318, "calib/mu_c": 0.6442384105960265, "calib/mu_w": 0.6776530612244899, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06722891566265059, "calib/std_conf": 0.06152686463401489, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5746817370612731, "calib/step_q_c_n": 1681.0, "calib/step_q_gap": -0.0379861618912839, "calib/step_q_w": 0.612667898952557, "calib/step_q_w_n": 1623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 781.09765625, "completions/mean_terminated_length": 793.49609375, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.07466666666666667, "grad_norm": 3.1837499141693115, "kl": 0.10294342041015625, "learning_rate": 3.638888888888889e-06, "loss": -0.1147, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021590005606412888, "mask/share_reasoning": 0.7688941955566406, "mask/share_step_conf": 0.19389083981513977, "num_tokens": 20979058.0, "reward": 0.6882272362709045, "reward_std": 0.1804431676864624, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.718786358833313, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.34516817331314087, "step": 70 }, { "adv/mean_abs_final_conf": 0.7644456624984741, "adv/mean_abs_reasoning": 0.5488430261611938, "adv/mean_abs_step_conf": 0.7760787010192871, "adv/ratio_final_to_reasoning": 1.3928311485440252, "adv/ratio_step_to_reasoning": 1.4140267144277328, "adv/std_final_conf": 0.9317163228988647, "adv/std_reasoning": 0.7927515506744385, "adv/std_step_conf": 0.9363721013069153, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.6953125, "calib/ece": 0.06669322709163347, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": -0.014381833473507077, "calib/mean_conf": 0.6547410358565737, "calib/mu_c": 0.6497560975609756, "calib/mu_w": 0.6641379310344827, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03402390438247012, "calib/std_conf": 0.07459473955077608, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5889476284584981, "calib/step_q_c_n": 2024.0, "calib/step_q_gap": -0.013124149681795538, "calib/step_q_w": 0.6020717781402937, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 742.265625, "completions/mean_terminated_length": 757.0518188476562, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.07573333333333333, "grad_norm": 0.25131160020828247, "kl": 0.07721710205078125, "learning_rate": 3.6111111111111115e-06, "loss": -0.0556, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.023128561675548553, "mask/share_reasoning": 0.7649872899055481, "mask/share_step_conf": 0.19235289096832275, "num_tokens": 21273486.0, "reward": 0.6729413866996765, "reward_std": 0.25942322611808777, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7465749979019165, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2750890254974365, "step": 71 }, { "adv/mean_abs_final_conf": 0.7536088228225708, "adv/mean_abs_reasoning": 0.30147671699523926, "adv/mean_abs_step_conf": 0.7713176608085632, "adv/ratio_final_to_reasoning": 2.4997247891434062, "adv/ratio_step_to_reasoning": 2.5584651063476436, "adv/std_final_conf": 0.9291418790817261, "adv/std_reasoning": 0.5959498882293701, "adv/std_step_conf": 0.9357434511184692, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.75390625, "calib/ece": 0.08629921259842521, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.016833333333333256, "calib/mean_conf": 0.6575590551181102, "calib/mu_c": 0.6506666666666667, "calib/mu_w": 0.6675, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07665354330708662, "calib/std_conf": 0.051979955789050146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5783435582822085, "calib/step_q_c_n": 1793.0, "calib/step_q_gap": -0.014781441717791388, "calib/step_q_w": 0.5931249999999999, "calib/step_q_w_n": 1472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 698.6796875, "completions/mean_terminated_length": 704.1810913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.0768, "grad_norm": 0.2750079035758972, "kl": 0.0861053466796875, "learning_rate": 3.5833333333333335e-06, "loss": 0.0094, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02209099940955639, "mask/share_reasoning": 0.7602138519287109, "mask/share_step_conf": 0.20988258719444275, "num_tokens": 21556756.0, "reward": 0.6741582155227661, "reward_std": 0.19570280611515045, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7370632886886597, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.295628160238266, "step": 72 }, { "adv/mean_abs_final_conf": 0.7786300182342529, "adv/mean_abs_reasoning": 0.35900551080703735, "adv/mean_abs_step_conf": 0.7729635238647461, "adv/ratio_final_to_reasoning": 2.1688525518282655, "adv/ratio_step_to_reasoning": 2.153068687238642, "adv/std_final_conf": 0.9293910264968872, "adv/std_reasoning": 0.6186598539352417, "adv/std_step_conf": 0.9356748461723328, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.515625, "calib/ece": 0.14157480314960633, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.002855339105339083, "calib/mean_conf": 0.6379527559055118, "calib/mu_c": 0.6373232323232323, "calib/mu_w": 0.6401785714285714, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.04520885187306327, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5724254742547426, "calib/step_q_c_n": 2214.0, "calib/step_q_gap": -0.01903229141283236, "calib/step_q_w": 0.5914577656675749, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 689.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.07786666666666667, "grad_norm": 0.20970845222473145, "kl": 0.08272552490234375, "learning_rate": 3.555555555555556e-06, "loss": 0.0009, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02301781065762043, "mask/share_reasoning": 0.7797709703445435, "mask/share_step_conf": 0.18939867615699768, "num_tokens": 21838796.0, "reward": 0.7931956648826599, "reward_std": 0.18731746077537537, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.7987773418426514, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.43448901176452637, "step": 73 }, { "adv/mean_abs_final_conf": 0.7864284515380859, "adv/mean_abs_reasoning": 0.38745418190956116, "adv/mean_abs_step_conf": 0.7657153606414795, "adv/ratio_final_to_reasoning": 2.029732774239749, "adv/ratio_step_to_reasoning": 1.9762733153831629, "adv/std_final_conf": 0.9295899271965027, "adv/std_reasoning": 0.6403974890708923, "adv/std_step_conf": 0.9360736012458801, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.75, "calib/ece": 0.16272000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03280922431865829, "calib/mean_conf": 0.6427999999999999, "calib/mu_c": 0.6288888888888888, "calib/mu_w": 0.6616981132075471, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11475999999999999, "calib/std_conf": 0.05854365892220951, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5696430931923331, "calib/step_q_c_n": 1513.0, "calib/step_q_gap": -0.044154736619203216, "calib/step_q_w": 0.6137978298115363, "calib/step_q_w_n": 1751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 679.0703125, "completions/mean_terminated_length": 695.3680419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.07893333333333333, "grad_norm": 0.24181662499904633, "kl": 0.09625244140625, "learning_rate": 3.5277777777777784e-06, "loss": -0.0788, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02417697384953499, "mask/share_reasoning": 0.7582485675811768, "mask/share_step_conf": 0.19413697719573975, "num_tokens": 22116566.0, "reward": 0.6208184361457825, "reward_std": 0.21420027315616608, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7147077918052673, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.21911655366420746, "step": 74 }, { "adv/mean_abs_final_conf": 0.7937130928039551, "adv/mean_abs_reasoning": 0.3079615533351898, "adv/mean_abs_step_conf": 0.7679805755615234, "adv/ratio_final_to_reasoning": 2.5773122787832747, "adv/ratio_step_to_reasoning": 2.4937547146531056, "adv/std_final_conf": 0.926730215549469, "adv/std_reasoning": 0.5726933479309082, "adv/std_step_conf": 0.9360403418540955, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.41796875, "calib/ece": 0.20246093749999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02472130546387974, "calib/mean_conf": 0.6330859375, "calib/mu_c": 0.6278712871287129, "calib/mu_w": 0.6525925925925926, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023242187499999983, "calib/std_conf": 0.04643805136680581, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5746615523465706, "calib/step_q_c_n": 2216.0, "calib/step_q_gap": -0.007813695178181868, "calib/step_q_w": 0.5824752475247524, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 643.375, "completions/mean_terminated_length": 648.44091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.08, "grad_norm": 0.1915205866098404, "kl": 0.0908966064453125, "learning_rate": 3.5e-06, "loss": 0.0015, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.024786923080682755, "mask/share_reasoning": 0.7751480340957642, "mask/share_step_conf": 0.192252516746521, "num_tokens": 22386022.0, "reward": 0.7966657876968384, "reward_std": 0.18921825289726257, "rewards/accuracy_reward_step": 0.7890625, "rewards/final_brier_reward_step": 0.7988425493240356, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.4366765022277832, "step": 75 }, { "adv/mean_abs_final_conf": 0.751955509185791, "adv/mean_abs_reasoning": 0.3741779029369354, "adv/mean_abs_step_conf": 0.7689213156700134, "adv/ratio_final_to_reasoning": 2.0096202990173015, "adv/ratio_step_to_reasoning": 2.0549618500577482, "adv/std_final_conf": 0.9292881488800049, "adv/std_reasoning": 0.6612319350242615, "adv/std_step_conf": 0.9360610842704773, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.3515625, "calib/ece": 0.06920634920634912, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.019063157894736893, "calib/mean_conf": 0.6423015873015874, "calib/mu_c": 0.6347368421052632, "calib/mu_w": 0.6538, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05416666666666666, "calib/std_conf": 0.05576541993899569, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5709497882637629, "calib/step_q_c_n": 1653.0, "calib/step_q_gap": -0.03404027137838428, "calib/step_q_w": 0.6049900596421471, "calib/step_q_w_n": 1509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 668.32421875, "completions/mean_terminated_length": 681.637451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.08106666666666666, "grad_norm": 0.19638362526893616, "kl": 0.0817718505859375, "learning_rate": 3.4722222222222224e-06, "loss": -0.0522, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.024196840822696686, "mask/share_reasoning": 0.7593661546707153, "mask/share_step_conf": 0.196905717253685, "num_tokens": 22660169.0, "reward": 0.6167426109313965, "reward_std": 0.22435541450977325, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7352085709571838, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1826515942811966, "step": 76 }, { "adv/mean_abs_final_conf": 0.7270142436027527, "adv/mean_abs_reasoning": 0.47899097204208374, "adv/mean_abs_step_conf": 0.7548226118087769, "adv/ratio_final_to_reasoning": 1.5178036456580184, "adv/ratio_step_to_reasoning": 1.5758597883186383, "adv/std_final_conf": 0.9317358136177063, "adv/std_reasoning": 0.7575597763061523, "adv/std_step_conf": 0.9361504316329956, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.13671875, "calib/ece": 0.09701195219123503, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00398406374501992, "calib/gap": -0.018355166323226202, "calib/mean_conf": 0.6479282868525896, "calib/mu_c": 0.6421511627906977, "calib/mu_w": 0.6605063291139239, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02984063745019921, "calib/std_conf": 0.06739083571007691, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5808516746411483, "calib/step_q_c_n": 2090.0, "calib/step_q_gap": -0.040012425908733884, "calib/step_q_w": 0.6208641005498822, "calib/step_q_w_n": 1273.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 678.09375, "completions/mean_terminated_length": 691.6016235351562, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.08213333333333334, "grad_norm": 0.7611878514289856, "kl": 0.1143951416015625, "learning_rate": 3.444444444444445e-06, "loss": -0.0598, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.025160975754261017, "mask/share_reasoning": 0.7508993744850159, "mask/share_step_conf": 0.2044084072113037, "num_tokens": 22938425.0, "reward": 0.7108245491981506, "reward_std": 0.25719261169433594, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.755419909954071, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3357604146003723, "step": 77 }, { "adv/mean_abs_final_conf": 0.7609703540802002, "adv/mean_abs_reasoning": 0.4544995129108429, "adv/mean_abs_step_conf": 0.7707890272140503, "adv/ratio_final_to_reasoning": 1.6743040035545127, "adv/ratio_step_to_reasoning": 1.6959072679253948, "adv/std_final_conf": 0.9323300719261169, "adv/std_reasoning": 0.7205326557159424, "adv/std_step_conf": 0.9358113408088684, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.26953125, "calib/ece": 0.0807874015748031, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.015016769865841106, "calib/mean_conf": 0.6534645669291338, "calib/mu_c": 0.6474342105263158, "calib/mu_w": 0.6624509803921569, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0679133858267716, "calib/std_conf": 0.053893712147748914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5778175026680896, "calib/step_q_c_n": 1874.0, "calib/step_q_gap": -0.020140475007550562, "calib/step_q_w": 0.5979579776756402, "calib/step_q_w_n": 1523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 776.51171875, "completions/mean_terminated_length": 782.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.0832, "grad_norm": 0.24049845337867737, "kl": 0.0765838623046875, "learning_rate": 3.416666666666667e-06, "loss": -0.0022, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020861711353063583, "mask/share_reasoning": 0.7841784358024597, "mask/share_step_conf": 0.1871473640203476, "num_tokens": 23245236.0, "reward": 0.6060810089111328, "reward_std": 0.21829815208911896, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7407039403915405, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.15427061915397644, "step": 78 }, { "adv/mean_abs_final_conf": 0.7370405197143555, "adv/mean_abs_reasoning": 0.42967432737350464, "adv/mean_abs_step_conf": 0.7801761627197266, "adv/ratio_final_to_reasoning": 1.7153468866052717, "adv/ratio_step_to_reasoning": 1.8157383697759066, "adv/std_final_conf": 0.9302157163619995, "adv/std_reasoning": 0.7205436825752258, "adv/std_step_conf": 0.9363252520561218, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.58203125, "calib/ece": 0.029561752988047797, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.01458523119392685, "calib/mean_conf": 0.6455776892430278, "calib/mu_c": 0.6508074534161491, "calib/mu_w": 0.6362222222222222, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016852589641434237, "calib/std_conf": 0.05186525735554406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5753923205342237, "calib/step_q_c_n": 1797.0, "calib/step_q_gap": -0.05129307272420325, "calib/step_q_w": 0.626685393258427, "calib/step_q_w_n": 1424.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 723.109375, "completions/mean_terminated_length": 737.5139770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.08426666666666667, "grad_norm": 0.21749094128608704, "kl": 0.08119964599609375, "learning_rate": 3.3888888888888893e-06, "loss": -0.0451, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.021755652502179146, "mask/share_reasoning": 0.7732414603233337, "mask/share_step_conf": 0.18547168374061584, "num_tokens": 23536728.0, "reward": 0.639240026473999, "reward_std": 0.2535552978515625, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7588882446289062, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.1977168172597885, "step": 79 }, { "adv/mean_abs_final_conf": 0.7694152593612671, "adv/mean_abs_reasoning": 0.3173452615737915, "adv/mean_abs_step_conf": 0.7661705017089844, "adv/ratio_final_to_reasoning": 2.4245367822590187, "adv/ratio_step_to_reasoning": 2.414312089959562, "adv/std_final_conf": 0.9289205074310303, "adv/std_reasoning": 0.5960049033164978, "adv/std_step_conf": 0.9357802271842957, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 11.53125, "calib/ece": 0.10195312500000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0006266706266706557, "calib/mean_conf": 0.638203125, "calib/mu_c": 0.638021978021978, "calib/mu_w": 0.6386486486486487, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014609374999999994, "calib/std_conf": 0.04810362502176292, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5719396551724139, "calib/step_q_c_n": 2088.0, "calib/step_q_gap": -0.0060117337164750095, "calib/step_q_w": 0.5779513888888889, "calib/step_q_w_n": 864.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 625.4609375, "completions/mean_terminated_length": 630.3858032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.08533333333333333, "grad_norm": 0.32449737191200256, "kl": 0.0968017578125, "learning_rate": 3.3611111111111117e-06, "loss": -0.0326, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02532077580690384, "mask/share_reasoning": 0.7636048197746277, "mask/share_step_conf": 0.20326188206672668, "num_tokens": 23799006.0, "reward": 0.6887508630752563, "reward_std": 0.20290717482566833, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.786632776260376, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.2486814260482788, "step": 80 }, { "adv/mean_abs_final_conf": 0.7590487003326416, "adv/mean_abs_reasoning": 0.3880404531955719, "adv/mean_abs_step_conf": 0.7771025896072388, "adv/ratio_final_to_reasoning": 1.9561071380103816, "adv/ratio_step_to_reasoning": 2.002632929653806, "adv/std_final_conf": 0.9300270676612854, "adv/std_reasoning": 0.6612900495529175, "adv/std_step_conf": 0.9357220530509949, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.39453125, "calib/ece": 0.13869565217391305, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.023583829365079412, "calib/mean_conf": 0.6466007905138341, "calib/mu_c": 0.6406349206349206, "calib/mu_w": 0.66421875, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.019130434782608677, "calib/std_conf": 0.049681860594208434, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5843677204658903, "calib/step_q_c_n": 2404.0, "calib/step_q_gap": -0.015905450265817023, "calib/step_q_w": 0.6002731707317073, "calib/step_q_w_n": 1025.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 733.7890625, "completions/mean_terminated_length": 739.5669555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.0864, "grad_norm": 0.23874109983444214, "kl": 0.078399658203125, "learning_rate": 3.3333333333333333e-06, "loss": 0.0096, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02268517017364502, "mask/share_reasoning": 0.7729635834693909, "mask/share_step_conf": 0.19653871655464172, "num_tokens": 24093104.0, "reward": 0.7508682012557983, "reward_std": 0.2056404948234558, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7783128619194031, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3788922429084778, "step": 81 }, { "adv/mean_abs_final_conf": 0.7448205947875977, "adv/mean_abs_reasoning": 0.41496533155441284, "adv/mean_abs_step_conf": 0.7517217397689819, "adv/ratio_final_to_reasoning": 1.7948983641539031, "adv/ratio_step_to_reasoning": 1.8115290184679236, "adv/std_final_conf": 0.9288557171821594, "adv/std_reasoning": 0.7013262510299683, "adv/std_step_conf": 0.9361816644668579, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.83984375, "calib/ece": 0.05484000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.016202805499336947, "calib/mean_conf": 0.63788, "calib/mu_c": 0.6321118012422361, "calib/mu_w": 0.648314606741573, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02436000000000001, "calib/std_conf": 0.041932154726414914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5694038668098819, "calib/step_q_c_n": 1862.0, "calib/step_q_gap": -0.03523472968134622, "calib/step_q_w": 0.6046385964912281, "calib/step_q_w_n": 1425.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 647.83203125, "completions/mean_terminated_length": 660.737060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.08746666666666666, "grad_norm": 8.346512794494629, "kl": 0.209014892578125, "learning_rate": 3.3055555555555558e-06, "loss": -0.0679, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02369321510195732, "mask/share_reasoning": 0.7597755193710327, "mask/share_step_conf": 0.19700007140636444, "num_tokens": 24364501.0, "reward": 0.6778282523155212, "reward_std": 0.2450428605079651, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7436628937721252, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2908998727798462, "step": 82 }, { "adv/mean_abs_final_conf": 0.7520098686218262, "adv/mean_abs_reasoning": 0.2951866388320923, "adv/mean_abs_step_conf": 0.7480201721191406, "adv/ratio_final_to_reasoning": 2.547574211343568, "adv/ratio_step_to_reasoning": 2.534058367542267, "adv/std_final_conf": 0.9304251074790955, "adv/std_reasoning": 0.5961121320724487, "adv/std_step_conf": 0.9359679222106934, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.26171875, "calib/ece": 0.13076612903225818, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.029567307692307865, "calib/mean_conf": 0.6555241935483871, "calib/mu_c": 0.643125, "calib/mu_w": 0.6726923076923078, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10282258064516131, "calib/std_conf": 0.05380414957460447, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5741127232142856, "calib/step_q_c_n": 1792.0, "calib/step_q_gap": -0.04678022998636, "calib/step_q_w": 0.6208929532006456, "calib/step_q_w_n": 1859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 789.95703125, "completions/mean_terminated_length": 812.1646118164062, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.08853333333333334, "grad_norm": 0.1789998710155487, "kl": 0.07549285888671875, "learning_rate": 3.277777777777778e-06, "loss": -0.0828, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02037701942026615, "mask/share_reasoning": 0.7650948166847229, "mask/share_step_conf": 0.1871844232082367, "num_tokens": 24673994.0, "reward": 0.6012614965438843, "reward_std": 0.20678496360778809, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7106777429580688, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.18559530377388, "step": 83 }, { "adv/mean_abs_final_conf": 0.7646782398223877, "adv/mean_abs_reasoning": 0.36597129702568054, "adv/mean_abs_step_conf": 0.7656229734420776, "adv/ratio_final_to_reasoning": 2.0894486699833443, "adv/ratio_step_to_reasoning": 2.0920301118269204, "adv/std_final_conf": 0.9289400577545166, "adv/std_reasoning": 0.6403570771217346, "adv/std_step_conf": 0.9360085129737854, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.29296875, "calib/ece": 0.05787148594377517, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.009090157154673206, "calib/mean_conf": 0.6423694779116466, "calib/mu_c": 0.6389743589743591, "calib/mu_w": 0.6480645161290323, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03686746987951806, "calib/std_conf": 0.04789135998877891, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5734943639291465, "calib/step_q_c_n": 1863.0, "calib/step_q_gap": -0.044200441265658696, "calib/step_q_w": 0.6176948051948052, "calib/step_q_w_n": 1540.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 675.76171875, "completions/mean_terminated_length": 691.9800415039062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0896, "grad_norm": 0.24721167981624603, "kl": 0.08000946044921875, "learning_rate": 3.2500000000000002e-06, "loss": -0.0095, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02393614500761032, "mask/share_reasoning": 0.7570117712020874, "mask/share_step_conf": 0.19561460614204407, "num_tokens": 24952909.0, "reward": 0.6713294386863708, "reward_std": 0.21048003435134888, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7384449243545532, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.2878076434135437, "step": 84 }, { "adv/mean_abs_final_conf": 0.7614791393280029, "adv/mean_abs_reasoning": 0.4133550226688385, "adv/mean_abs_step_conf": 0.7623780369758606, "adv/ratio_final_to_reasoning": 1.8421915727828615, "adv/ratio_step_to_reasoning": 1.8443662110443102, "adv/std_final_conf": 0.9300859570503235, "adv/std_reasoning": 0.681785523891449, "adv/std_step_conf": 0.9362416863441467, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 16.1171875, "calib/ece": 0.08950413223140495, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.015131104967747167, "calib/mean_conf": 0.6545454545454544, "calib/mu_c": 0.6502312138728324, "calib/mu_w": 0.6653623188405796, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014586776859504125, "calib/std_conf": 0.05056703267335774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5807871591908531, "calib/step_q_c_n": 2274.0, "calib/step_q_gap": -0.08242558378970843, "calib/step_q_w": 0.6632127429805615, "calib/step_q_w_n": 1852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 736.36328125, "completions/mean_terminated_length": 775.7572021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.09066666666666667, "grad_norm": 0.2139563411474228, "kl": 0.065704345703125, "learning_rate": 3.2222222222222227e-06, "loss": -0.1757, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.020088504999876022, "mask/share_reasoning": 0.7408749461174011, "mask/share_step_conf": 0.18825532495975494, "num_tokens": 25249242.0, "reward": 0.7280115485191345, "reward_std": 0.23207849264144897, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7409422397613525, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.3908621668815613, "step": 85 }, { "adv/mean_abs_final_conf": 0.7666450142860413, "adv/mean_abs_reasoning": 0.47989705204963684, "adv/mean_abs_step_conf": 0.7776232957839966, "adv/ratio_final_to_reasoning": 1.5975197409771658, "adv/ratio_step_to_reasoning": 1.6203960671622655, "adv/std_final_conf": 0.9320396184921265, "adv/std_reasoning": 0.7395176887512207, "adv/std_step_conf": 0.9357236623764038, "calib/answer_extract_rate": 0.91796875, "calib/avg_num_step_conf": 17.3359375, "calib/ece": 0.13842553191489365, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.022039048737521938, "calib/mean_conf": 0.6698297872340425, "calib/mu_c": 0.6600763358778626, "calib/mu_w": 0.6821153846153846, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12540425531914895, "calib/std_conf": 0.06703423042663278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5864088397790056, "calib/step_q_c_n": 1810.0, "calib/step_q_gap": -0.0714184052742668, "calib/step_q_w": 0.6578272450532724, "calib/step_q_w_n": 2628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 797.66015625, "completions/mean_terminated_length": 861.6075439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.09173333333333333, "grad_norm": 0.2806868255138397, "kl": 0.06467437744140625, "learning_rate": 3.1944444444444443e-06, "loss": -0.2478, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.019146300852298737, "mask/share_reasoning": 0.7248205542564392, "mask/share_step_conf": 0.18181441724300385, "num_tokens": 25558955.0, "reward": 0.5457245111465454, "reward_std": 0.25168851017951965, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6658051013946533, "rewards/format_reward_step": 0.91796875, "rewards/step_correlation_reward": 0.13970647752285004, "step": 86 }, { "adv/mean_abs_final_conf": 0.7346113920211792, "adv/mean_abs_reasoning": 0.46212661266326904, "adv/mean_abs_step_conf": 0.7368422746658325, "adv/ratio_final_to_reasoning": 1.5896323039860456, "adv/ratio_step_to_reasoning": 1.5944597313263507, "adv/std_final_conf": 0.9318578243255615, "adv/std_reasoning": 0.7396441102027893, "adv/std_step_conf": 0.9363143444061279, "calib/answer_extract_rate": 0.890625, "calib/avg_num_step_conf": 17.08203125, "calib/ece": 0.14903508771929824, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.009928332537028073, "calib/mean_conf": 0.6514035087719299, "calib/mu_c": 0.6534065934065934, "calib/mu_w": 0.6434782608695653, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0010964912280701754, "calib/std_conf": 0.053702840478021444, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5824095394736842, "calib/step_q_c_n": 2432.0, "calib/step_q_gap": -0.10879087268499688, "calib/step_q_w": 0.6912004121586811, "calib/step_q_w_n": 1941.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 658.2109375, "completions/mean_terminated_length": 735.8165893554688, "completions/min_length": 0.0, "completions/min_terminated_length": 251.0, "epoch": 0.0928, "grad_norm": 0.5631839036941528, "kl": 0.074859619140625, "learning_rate": 3.1666666666666667e-06, "loss": -0.3261, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.020042482763528824, "mask/share_reasoning": 0.6945275068283081, "mask/share_step_conf": 0.17996126413345337, "num_tokens": 25832953.0, "reward": 0.7213019132614136, "reward_std": 0.2892686724662781, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.728265643119812, "rewards/format_reward_step": 0.890625, "rewards/step_correlation_reward": 0.3940257430076599, "step": 87 }, { "adv/mean_abs_final_conf": 0.7458748817443848, "adv/mean_abs_reasoning": 0.4192622900009155, "adv/mean_abs_step_conf": 0.7751203775405884, "adv/ratio_final_to_reasoning": 1.7790173348114757, "adv/ratio_step_to_reasoning": 1.8487719883867824, "adv/std_final_conf": 0.9322258830070496, "adv/std_reasoning": 0.7017030715942383, "adv/std_step_conf": 0.9357836842536926, "calib/answer_extract_rate": 0.92578125, "calib/avg_num_step_conf": 17.6484375, "calib/ece": 0.0789029535864979, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0019180790960452443, "calib/mean_conf": 0.6679324894514769, "calib/mu_c": 0.6684180790960452, "calib/mu_w": 0.6665, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.05338685025245993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5890498625834315, "calib/step_q_c_n": 2547.0, "calib/step_q_gap": -0.08474567267785726, "calib/step_q_w": 0.6737955352612888, "calib/step_q_w_n": 1971.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 778.875, "completions/mean_terminated_length": 837.7815551757812, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.09386666666666667, "grad_norm": 0.2500193119049072, "kl": 0.06778717041015625, "learning_rate": 3.138888888888889e-06, "loss": -0.2581, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017798837274312973, "mask/share_reasoning": 0.7285441160202026, "mask/share_step_conf": 0.1833444982767105, "num_tokens": 26142193.0, "reward": 0.784960150718689, "reward_std": 0.2499820590019226, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.743010938167572, "rewards/format_reward_step": 0.92578125, "rewards/step_correlation_reward": 0.5034719109535217, "step": 88 }, { "adv/mean_abs_final_conf": 0.7492777109146118, "adv/mean_abs_reasoning": 0.6360772848129272, "adv/mean_abs_step_conf": 0.7455618381500244, "adv/ratio_final_to_reasoning": 1.177966465403614, "adv/ratio_step_to_reasoning": 1.1721246080486853, "adv/std_final_conf": 0.9346681833267212, "adv/std_reasoning": 0.859366774559021, "adv/std_step_conf": 0.9364317655563354, "calib/answer_extract_rate": 0.796875, "calib/avg_num_step_conf": 22.46875, "calib/ece": 0.12450980392156871, "calib/final_conf_rate": 0.796875, "calib/format_rate": 0.796875, "calib/frac_conf_gt_0.9": 0.004901960784313725, "calib/gap": -0.03393914043710122, "calib/mean_conf": 0.6731372549019607, "calib/mu_c": 0.6609923664122138, "calib/mu_w": 0.694931506849315, "calib/nonempty_final_conf_rate": 0.796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07774509803921568, "calib/std_conf": 0.060811189504358695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.58310119695321, "calib/step_q_c_n": 1838.0, "calib/step_q_gap": -0.11488296247448548, "calib/step_q_w": 0.6979841594276955, "calib/step_q_w_n": 3914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 679.6015625, "completions/mean_terminated_length": 848.6731567382812, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.09493333333333333, "grad_norm": 0.9176256656646729, "kl": 0.10498046875, "learning_rate": 3.1111111111111116e-06, "loss": -0.6332, "mask/has_final_conf_rate": 0.796875, "mask/share_final_conf": 0.015897078439593315, "mask/share_reasoning": 0.6220657825469971, "mask/share_step_conf": 0.16281834244728088, "num_tokens": 26425059.0, "reward": 0.5518848896026611, "reward_std": 0.3001466393470764, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5976187586784363, "rewards/format_reward_step": 0.796875, "rewards/step_correlation_reward": 0.24443230032920837, "step": 89 }, { "adv/mean_abs_final_conf": 0.7582546472549438, "adv/mean_abs_reasoning": 0.5267269015312195, "adv/mean_abs_step_conf": 0.748308539390564, "adv/ratio_final_to_reasoning": 1.4395593713756836, "adv/ratio_step_to_reasoning": 1.4206765160754016, "adv/std_final_conf": 0.9336482882499695, "adv/std_reasoning": 0.7757920622825623, "adv/std_step_conf": 0.9350053071975708, "calib/answer_extract_rate": 0.84375, "calib/avg_num_step_conf": 22.69921875, "calib/ece": 0.10333333333333322, "calib/final_conf_rate": 0.84375, "calib/format_rate": 0.84375, "calib/frac_conf_gt_0.9": 0.004629629629629629, "calib/gap": -0.005089285714285685, "calib/mean_conf": 0.6758333333333333, "calib/mu_c": 0.6747023809523809, "calib/mu_w": 0.6797916666666666, "calib/nonempty_final_conf_rate": 0.84375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0006944444444444461, "calib/std_conf": 0.06474486966662414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5961133768352366, "calib/step_q_c_n": 2452.0, "calib/step_q_gap": -0.12455051122668659, "calib/step_q_w": 0.7206638880619232, "calib/step_q_w_n": 3359.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 714.82421875, "completions/mean_terminated_length": 843.294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.096, "grad_norm": 0.6938077807426453, "kl": 0.066070556640625, "learning_rate": 3.0833333333333336e-06, "loss": -0.5306, "mask/has_final_conf_rate": 0.84375, "mask/share_final_conf": 0.01659608632326126, "mask/share_reasoning": 0.6598981618881226, "mask/share_step_conf": 0.1711619645357132, "num_tokens": 26711374.0, "reward": 0.7062116265296936, "reward_std": 0.26097801327705383, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6841264963150024, "rewards/format_reward_step": 0.84375, "rewards/step_correlation_reward": 0.4282967746257782, "step": 90 }, { "adv/mean_abs_final_conf": 0.7301865816116333, "adv/mean_abs_reasoning": 0.39002954959869385, "adv/mean_abs_step_conf": 0.7586683630943298, "adv/ratio_final_to_reasoning": 1.8721314381511123, "adv/ratio_step_to_reasoning": 1.9451561141327187, "adv/std_final_conf": 0.9313138127326965, "adv/std_reasoning": 0.68172287940979, "adv/std_step_conf": 0.9356062412261963, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.78125, "calib/ece": 0.08085020242914978, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0170270486555697, "calib/mean_conf": 0.6633603238866397, "calib/mu_c": 0.6584659090909091, "calib/mu_w": 0.6754929577464788, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01582995951417004, "calib/std_conf": 0.051766797667866085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5833746385790995, "calib/step_q_c_n": 2421.0, "calib/step_q_gap": -0.05881189631898576, "calib/step_q_w": 0.6421865348980853, "calib/step_q_w_n": 1619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2890.0, "completions/max_terminated_length": 2890.0, "completions/mean_length": 859.53125, "completions/mean_terminated_length": 883.6947631835938, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.09706666666666666, "grad_norm": 0.14019861817359924, "kl": 0.05571746826171875, "learning_rate": 3.055555555555556e-06, "loss": -0.1903, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017590008676052094, "mask/share_reasoning": 0.7758537530899048, "mask/share_step_conf": 0.1792125105857849, "num_tokens": 27039126.0, "reward": 0.7212048768997192, "reward_std": 0.21607019007205963, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7555722594261169, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3563687205314636, "step": 91 }, { "adv/mean_abs_final_conf": 0.7461353540420532, "adv/mean_abs_reasoning": 0.33928218483924866, "adv/mean_abs_step_conf": 0.7547565698623657, "adv/ratio_final_to_reasoning": 2.199158657256263, "adv/ratio_step_to_reasoning": 2.2245688208473666, "adv/std_final_conf": 0.9299798011779785, "adv/std_reasoning": 0.6403141021728516, "adv/std_step_conf": 0.9356287717819214, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.00390625, "calib/ece": 0.1760956175298805, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00398406374501992, "calib/gap": -0.027140638481449653, "calib/mean_conf": 0.6447011952191236, "calib/mu_c": 0.6381052631578947, "calib/mu_w": 0.6652459016393444, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03191235059760955, "calib/std_conf": 0.056542216701220574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.576131993006993, "calib/step_q_c_n": 2288.0, "calib/step_q_gap": -0.04979500026870343, "calib/step_q_w": 0.6259269932756965, "calib/step_q_w_n": 1041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 756.109375, "completions/mean_terminated_length": 765.0751342773438, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.09813333333333334, "grad_norm": 0.1883665770292282, "kl": 0.0677642822265625, "learning_rate": 3.0277777777777776e-06, "loss": -0.0579, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.021719403564929962, "mask/share_reasoning": 0.7846254110336304, "mask/share_step_conf": 0.18193641304969788, "num_tokens": 27339410.0, "reward": 0.7869164943695068, "reward_std": 0.21456190943717957, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7748124599456787, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.45448917150497437, "step": 92 }, { "adv/mean_abs_final_conf": 0.755165696144104, "adv/mean_abs_reasoning": 0.4527568519115448, "adv/mean_abs_step_conf": 0.7692996859550476, "adv/ratio_final_to_reasoning": 1.6679277032601174, "adv/ratio_step_to_reasoning": 1.6991453198489548, "adv/std_final_conf": 0.9280458092689514, "adv/std_reasoning": 0.7207328677177429, "adv/std_step_conf": 0.935437023639679, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 13.69921875, "calib/ece": 0.1453658536585366, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.026574675324675168, "calib/mean_conf": 0.648130081300813, "calib/mu_c": 0.6405681818181818, "calib/mu_w": 0.6671428571428569, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03902439024390242, "calib/std_conf": 0.05590860127691766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5751223091976517, "calib/step_q_c_n": 2044.0, "calib/step_q_gap": -0.05254002846468597, "calib/step_q_w": 0.6276623376623377, "calib/step_q_w_n": 1463.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 738.7734375, "completions/mean_terminated_length": 768.8048706054688, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.0992, "grad_norm": 0.18260808289051056, "kl": 0.063751220703125, "learning_rate": 3e-06, "loss": -0.1469, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02047555334866047, "mask/share_reasoning": 0.7621718645095825, "mask/share_step_conf": 0.17829002439975739, "num_tokens": 27634312.0, "reward": 0.7290872931480408, "reward_std": 0.24152934551239014, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7475515604019165, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.38093554973602295, "step": 93 }, { "adv/mean_abs_final_conf": 0.7560976147651672, "adv/mean_abs_reasoning": 0.2771226167678833, "adv/mean_abs_step_conf": 0.7483853101730347, "adv/ratio_final_to_reasoning": 2.7283865300625076, "adv/ratio_step_to_reasoning": 2.700556594411343, "adv/std_final_conf": 0.9292981028556824, "adv/std_reasoning": 0.5726613402366638, "adv/std_step_conf": 0.9354825615882874, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.74609375, "calib/ece": 0.033992094861660126, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.008102249488752489, "calib/mean_conf": 0.6394466403162056, "calib/mu_c": 0.6365644171779141, "calib/mu_w": 0.6446666666666666, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014584980237154203, "calib/std_conf": 0.04471793592128724, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5735790543975597, "calib/step_q_c_n": 1967.0, "calib/step_q_gap": -0.01689934066416854, "calib/step_q_w": 0.5904783950617283, "calib/step_q_w_n": 1296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 730.92578125, "completions/mean_terminated_length": 739.5928955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.10026666666666667, "grad_norm": 0.14898745715618134, "kl": 0.06661224365234375, "learning_rate": 2.9722222222222225e-06, "loss": -0.0483, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021695515140891075, "mask/share_reasoning": 0.7875087261199951, "mask/share_step_conf": 0.17907699942588806, "num_tokens": 27930109.0, "reward": 0.6933168172836304, "reward_std": 0.15646381676197052, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7561109066009521, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.30552273988723755, "step": 94 }, { "adv/mean_abs_final_conf": 0.7589545845985413, "adv/mean_abs_reasoning": 0.41713064908981323, "adv/mean_abs_step_conf": 0.7742445468902588, "adv/ratio_final_to_reasoning": 1.8194649236506455, "adv/ratio_step_to_reasoning": 1.8561200155866626, "adv/std_final_conf": 0.9303404688835144, "adv/std_reasoning": 0.7013149261474609, "adv/std_step_conf": 0.9355771541595459, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.4296875, "calib/ece": 0.09443999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.004, "calib/gap": -0.024019047619047496, "calib/mean_conf": 0.6593199999999999, "calib/mu_c": 0.6521142857142858, "calib/mu_w": 0.6761333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02688, "calib/std_conf": 0.05826780929466974, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5761407652338215, "calib/step_q_c_n": 2117.0, "calib/step_q_gap": -0.03365484415300668, "calib/step_q_w": 0.6097956093868282, "calib/step_q_w_n": 1321.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 832.6953125, "completions/mean_terminated_length": 849.2828979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.10133333333333333, "grad_norm": 0.18183837831020355, "kl": 0.0625762939453125, "learning_rate": 2.944444444444445e-06, "loss": -0.0559, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019320238381624222, "mask/share_reasoning": 0.7809579372406006, "mask/share_step_conf": 0.1801905632019043, "num_tokens": 28249407.0, "reward": 0.6880050897598267, "reward_std": 0.2226249724626541, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7567011117935181, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.28727781772613525, "step": 95 }, { "adv/mean_abs_final_conf": 0.7273871302604675, "adv/mean_abs_reasoning": 0.32089483737945557, "adv/mean_abs_step_conf": 0.755443811416626, "adv/ratio_final_to_reasoning": 2.2667461284219357, "adv/ratio_step_to_reasoning": 2.354178763316531, "adv/std_final_conf": 0.9287962317466736, "adv/std_reasoning": 0.6401641368865967, "adv/std_step_conf": 0.9360931515693665, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.828125, "calib/ece": 0.20127490039840634, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00796812749003984, "calib/gap": -0.049510761439681805, "calib/mean_conf": 0.6471713147410358, "calib/mu_c": 0.6359278350515464, "calib/mu_w": 0.6854385964912282, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03776892430278886, "calib/std_conf": 0.04955143946699769, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5711596119929454, "calib/step_q_c_n": 2268.0, "calib/step_q_gap": -0.03606479745587354, "calib/step_q_w": 0.607224409448819, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 728.52734375, "completions/mean_terminated_length": 746.0120239257812, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.1024, "grad_norm": 0.1968483030796051, "kl": 0.07297515869140625, "learning_rate": 2.916666666666667e-06, "loss": -0.0946, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020755115896463394, "mask/share_reasoning": 0.7726129293441772, "mask/share_step_conf": 0.18319444358348846, "num_tokens": 28541726.0, "reward": 0.7672200202941895, "reward_std": 0.1859196275472641, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7734265327453613, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.4133572578430176, "step": 96 }, { "adv/mean_abs_final_conf": 0.7504986524581909, "adv/mean_abs_reasoning": 0.44124436378479004, "adv/mean_abs_step_conf": 0.771976113319397, "adv/ratio_final_to_reasoning": 1.7008685301286586, "adv/ratio_step_to_reasoning": 1.7495432841288736, "adv/std_final_conf": 0.9311988353729248, "adv/std_reasoning": 0.7205365300178528, "adv/std_step_conf": 0.9355778098106384, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.8828125, "calib/ece": 0.034722222222222224, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0023157747268144346, "calib/mean_conf": 0.6444047619047618, "calib/mu_c": 0.6436144578313252, "calib/mu_w": 0.6459302325581396, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010198412698412694, "calib/std_conf": 0.04115450535066114, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5757505995203838, "calib/step_q_c_n": 2085.0, "calib/step_q_gap": -0.011545360908305491, "calib/step_q_w": 0.5872959604286893, "calib/step_q_w_n": 1213.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 753.28125, "completions/mean_terminated_length": 762.2134399414062, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.10346666666666667, "grad_norm": 0.17708462476730347, "kl": 0.0679931640625, "learning_rate": 2.888888888888889e-06, "loss": 0.0231, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020922204479575157, "mask/share_reasoning": 0.7850449085235596, "mask/share_step_conf": 0.1823141723871231, "num_tokens": 28839638.0, "reward": 0.7043203115463257, "reward_std": 0.244666188955307, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7601886987686157, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.32188940048217773, "step": 97 }, { "adv/mean_abs_final_conf": 0.758463442325592, "adv/mean_abs_reasoning": 0.44855350255966187, "adv/mean_abs_step_conf": 0.7728298306465149, "adv/ratio_final_to_reasoning": 1.690909641764996, "adv/ratio_step_to_reasoning": 1.7229379020259044, "adv/std_final_conf": 0.9319073557853699, "adv/std_reasoning": 0.7205816507339478, "adv/std_step_conf": 0.9360450506210327, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.90625, "calib/ece": 0.0566269841269842, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.006484848484848538, "calib/mean_conf": 0.6557539682539683, "calib/mu_c": 0.6535151515151515, "calib/mu_w": 0.66, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028809523809523823, "calib/std_conf": 0.04955983821218589, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5806123388581953, "calib/step_q_c_n": 2172.0, "calib/step_q_gap": -0.022355960853620238, "calib/step_q_w": 0.6029682997118155, "calib/step_q_w_n": 1388.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 809.77734375, "completions/mean_terminated_length": 822.6309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.10453333333333334, "grad_norm": 0.36100202798843384, "kl": 0.07733917236328125, "learning_rate": 2.861111111111111e-06, "loss": -0.0337, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019724491983652115, "mask/share_reasoning": 0.7813058495521545, "mask/share_step_conf": 0.18334467709064484, "num_tokens": 29153125.0, "reward": 0.6995540261268616, "reward_std": 0.24638350307941437, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7565535306930542, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3167732357978821, "step": 98 }, { "adv/mean_abs_final_conf": 0.7385193109512329, "adv/mean_abs_reasoning": 0.4850810766220093, "adv/mean_abs_step_conf": 0.7860969305038452, "adv/ratio_final_to_reasoning": 1.522465720770037, "adv/ratio_step_to_reasoning": 1.6205475092494634, "adv/std_final_conf": 0.9335052967071533, "adv/std_reasoning": 0.757537305355072, "adv/std_step_conf": 0.9362009167671204, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 15.12109375, "calib/ece": 0.17814516129032254, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.010933646506165573, "calib/mean_conf": 0.6665322580645162, "calib/mu_c": 0.6613740458015268, "calib/mu_w": 0.6723076923076924, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15822580645161285, "calib/std_conf": 0.0557361924871592, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.588147549811524, "calib/step_q_c_n": 1857.0, "calib/step_q_gap": -0.013004386633361786, "calib/step_q_w": 0.6011519364448857, "calib/step_q_w_n": 2014.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3000.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 914.36328125, "completions/mean_terminated_length": 940.0682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.1056, "grad_norm": 0.23129351437091827, "kl": 0.0624847412109375, "learning_rate": 2.8333333333333335e-06, "loss": -0.0659, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018017534166574478, "mask/share_reasoning": 0.7806091904640198, "mask/share_step_conf": 0.17402949929237366, "num_tokens": 29493002.0, "reward": 0.5717493295669556, "reward_std": 0.27225303649902344, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7005148530006409, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.14689014852046967, "step": 99 }, { "adv/mean_abs_final_conf": 0.7555106282234192, "adv/mean_abs_reasoning": 0.4237300455570221, "adv/mean_abs_step_conf": 0.792742133140564, "adv/ratio_final_to_reasoning": 1.7829998985090822, "adv/ratio_step_to_reasoning": 1.870865994641589, "adv/std_final_conf": 0.9318886399269104, "adv/std_reasoning": 0.701408863067627, "adv/std_step_conf": 0.9356249570846558, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.87109375, "calib/ece": 0.05052, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0036363636363635488, "calib/mean_conf": 0.65276, "calib/mu_c": 0.6513636363636364, "calib/mu_w": 0.6549999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04364, "calib/std_conf": 0.050714715813065546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5829321446260525, "calib/step_q_c_n": 2019.0, "calib/step_q_gap": -0.01775976137916946, "calib/step_q_w": 0.6006919060052219, "calib/step_q_w_n": 1532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 833.890625, "completions/mean_terminated_length": 857.3333129882812, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.10666666666666667, "grad_norm": 0.17867591977119446, "kl": 0.0666046142578125, "learning_rate": 2.805555555555556e-06, "loss": -0.1227, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019194157794117928, "mask/share_reasoning": 0.7794458270072937, "mask/share_step_conf": 0.17401626706123352, "num_tokens": 29813886.0, "reward": 0.6280609965324402, "reward_std": 0.23585641384124756, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7400511503219604, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.20044583082199097, "step": 100 }, { "adv/mean_abs_final_conf": 0.7429351210594177, "adv/mean_abs_reasoning": 0.4691013693809509, "adv/mean_abs_step_conf": 0.7799124717712402, "adv/ratio_final_to_reasoning": 1.5837411049126358, "adv/ratio_step_to_reasoning": 1.6625670327938944, "adv/std_final_conf": 0.9320369362831116, "adv/std_reasoning": 0.7393046021461487, "adv/std_step_conf": 0.9358084797859192, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.81640625, "calib/ece": 0.07523809523809512, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": 0.0018114289437675124, "calib/mean_conf": 0.6665079365079364, "calib/mu_c": 0.667248322147651, "calib/mu_w": 0.6654368932038834, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07523809523809512, "calib/std_conf": 0.057173051987101216, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5900836042731074, "calib/step_q_c_n": 2153.0, "calib/step_q_gap": -0.017337127434209787, "calib/step_q_w": 0.6074207317073171, "calib/step_q_w_n": 1640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 876.9765625, "completions/mean_terminated_length": 883.8818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.10773333333333333, "grad_norm": 0.20067480206489563, "kl": 0.0763702392578125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0548, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01860753819346428, "mask/share_reasoning": 0.790267825126648, "mask/share_step_conf": 0.18331214785575867, "num_tokens": 30145384.0, "reward": 0.6333275437355042, "reward_std": 0.24875670671463013, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7385531067848206, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2148207128047943, "step": 101 }, { "adv/mean_abs_final_conf": 0.7533099055290222, "adv/mean_abs_reasoning": 0.22623670101165771, "adv/mean_abs_step_conf": 0.7874011993408203, "adv/ratio_final_to_reasoning": 3.329742266221452, "adv/ratio_step_to_reasoning": 3.4804308753611397, "adv/std_final_conf": 0.9287867546081543, "adv/std_reasoning": 0.5226889252662659, "adv/std_step_conf": 0.9349228143692017, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.01953125, "calib/ece": 0.17559055118110245, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.031882022471910076, "calib/mean_conf": 0.6501574803149606, "calib/mu_c": 0.6406179775280899, "calib/mu_w": 0.6725, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06248031496062993, "calib/std_conf": 0.061432816539549094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5780414312617702, "calib/step_q_c_n": 2124.0, "calib/step_q_gap": -0.025589668820942713, "calib/step_q_w": 0.6036311000827129, "calib/step_q_w_n": 1209.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 713.87109375, "completions/mean_terminated_length": 722.3359985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.1088, "grad_norm": 0.19545966386795044, "kl": 0.086181640625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0009, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02277192659676075, "mask/share_reasoning": 0.775929868221283, "mask/share_step_conf": 0.18957942724227905, "num_tokens": 30434831.0, "reward": 0.7263565063476562, "reward_std": 0.15422692894935608, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7645875215530396, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.35062551498413086, "step": 102 }, { "adv/mean_abs_final_conf": 0.7185190916061401, "adv/mean_abs_reasoning": 0.3661821484565735, "adv/mean_abs_step_conf": 0.7697737216949463, "adv/ratio_final_to_reasoning": 1.9621903870372621, "adv/ratio_step_to_reasoning": 2.102160700458711, "adv/std_final_conf": 0.9287464022636414, "adv/std_reasoning": 0.6611608266830444, "adv/std_step_conf": 0.9357419610023499, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.0, "calib/ece": 0.1333333333333334, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.03233678901653125, "calib/mean_conf": 0.6640476190476191, "calib/mu_c": 0.6530120481927711, "calib/mu_w": 0.6853488372093024, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06932539682539682, "calib/std_conf": 0.07228337721298855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5885881801125704, "calib/step_q_c_n": 2132.0, "calib/step_q_gap": -0.021466916306162376, "calib/step_q_w": 0.6100550964187328, "calib/step_q_w_n": 1452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 879.5703125, "completions/mean_terminated_length": 893.5317993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.10986666666666667, "grad_norm": 0.18727880716323853, "kl": 0.0642547607421875, "learning_rate": 2.7222222222222224e-06, "loss": -0.0503, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019160684198141098, "mask/share_reasoning": 0.7917564511299133, "mask/share_step_conf": 0.17345784604549408, "num_tokens": 30764553.0, "reward": 0.6658133268356323, "reward_std": 0.18848654627799988, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7436000108718872, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.26146411895751953, "step": 103 }, { "adv/mean_abs_final_conf": 0.7070366144180298, "adv/mean_abs_reasoning": 0.3533375561237335, "adv/mean_abs_step_conf": 0.7518788576126099, "adv/ratio_final_to_reasoning": 2.0010231071231956, "adv/ratio_step_to_reasoning": 2.1279335994199076, "adv/std_final_conf": 0.915729820728302, "adv/std_reasoning": 0.661066472530365, "adv/std_step_conf": 0.9352713823318481, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.3046875, "calib/ece": 0.083203125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.0003262764632627757, "calib/mean_conf": 0.653359375, "calib/mu_c": 0.6532191780821918, "calib/mu_w": 0.6535454545454545, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.083125, "calib/std_conf": 0.05210082388609009, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5790608604407136, "calib/step_q_c_n": 1906.0, "calib/step_q_gap": -0.0037258062259530877, "calib/step_q_w": 0.5827866666666667, "calib/step_q_w_n": 1500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1925.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 782.90625, "completions/mean_terminated_length": 789.0708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.11093333333333333, "grad_norm": 0.20894844830036163, "kl": 0.07109832763671875, "learning_rate": 2.6944444444444444e-06, "loss": 0.0184, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02064879611134529, "mask/share_reasoning": 0.7865666747093201, "mask/share_step_conf": 0.18497204780578613, "num_tokens": 31071657.0, "reward": 0.646728515625, "reward_std": 0.18793892860412598, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7451726794242859, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.23422178626060486, "step": 104 }, { "adv/mean_abs_final_conf": 0.7167232632637024, "adv/mean_abs_reasoning": 0.33493661880493164, "adv/mean_abs_step_conf": 0.7485485076904297, "adv/ratio_final_to_reasoning": 2.1398772872939427, "adv/ratio_step_to_reasoning": 2.2348959942370086, "adv/std_final_conf": 0.9292554259300232, "adv/std_reasoning": 0.6611365079879761, "adv/std_step_conf": 0.9348087906837463, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.68359375, "calib/ece": 0.2034126984126984, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.04890624999999982, "calib/mean_conf": 0.6552380952380953, "calib/mu_c": 0.64359375, "calib/mu_w": 0.6924999999999998, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04837301587301586, "calib/std_conf": 0.05699783184005442, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5769583161370201, "calib/step_q_c_n": 2423.0, "calib/step_q_gap": -0.03884723941853552, "calib/step_q_w": 0.6158055555555556, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 818.63671875, "completions/mean_terminated_length": 828.3439331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.112, "grad_norm": 0.16072368621826172, "kl": 0.06591796875, "learning_rate": 2.666666666666667e-06, "loss": -0.048, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019962280988693237, "mask/share_reasoning": 0.7864009141921997, "mask/share_step_conf": 0.18191802501678467, "num_tokens": 31386988.0, "reward": 0.8123526573181152, "reward_std": 0.18710803985595703, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7739390134811401, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.5038912296295166, "step": 105 }, { "adv/mean_abs_final_conf": 0.7355873584747314, "adv/mean_abs_reasoning": 0.3430485725402832, "adv/mean_abs_step_conf": 0.743608832359314, "adv/ratio_final_to_reasoning": 2.14426590680057, "adv/ratio_step_to_reasoning": 2.1676488167633874, "adv/std_final_conf": 0.9301002025604248, "adv/std_reasoning": 0.640177309513092, "adv/std_step_conf": 0.9360626935958862, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.17578125, "calib/ece": 0.039764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.008565789473684138, "calib/mean_conf": 0.6529411764705882, "calib/mu_c": 0.64975, "calib/mu_w": 0.6583157894736842, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03262745098039218, "calib/std_conf": 0.04668182321015065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5782606541129832, "calib/step_q_c_n": 2018.0, "calib/step_q_gap": -0.009008718580743702, "calib/step_q_w": 0.5872693726937269, "calib/step_q_w_n": 1355.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 783.30859375, "completions/mean_terminated_length": 789.4763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.11306666666666666, "grad_norm": 0.1853887140750885, "kl": 0.06472015380859375, "learning_rate": 2.6388888888888893e-06, "loss": -0.0049, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019807640463113785, "mask/share_reasoning": 0.7897074222564697, "mask/share_step_conf": 0.1826724410057068, "num_tokens": 31692099.0, "reward": 0.6519793272018433, "reward_std": 0.19260109961032867, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7564437389373779, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2232961654663086, "step": 106 }, { "adv/mean_abs_final_conf": 0.7967756390571594, "adv/mean_abs_reasoning": 0.45540252327919006, "adv/mean_abs_step_conf": 0.7411437034606934, "adv/ratio_final_to_reasoning": 1.749607431508864, "adv/ratio_step_to_reasoning": 1.6274475119812328, "adv/std_final_conf": 0.9317139983177185, "adv/std_reasoning": 0.7013875842094421, "adv/std_step_conf": 0.9358682632446289, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.51953125, "calib/ece": 0.026601562499999974, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.011492768959435629, "calib/mean_conf": 0.6569921875, "calib/mu_c": 0.6606285714285713, "calib/mu_w": 0.6491358024691357, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.0451639368187146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5853536184210526, "calib/step_q_c_n": 2432.0, "calib/step_q_gap": 0.009056242327758146, "calib/step_q_w": 0.5762973760932945, "calib/step_q_w_n": 1029.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 778.984375, "completions/mean_terminated_length": 785.1181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.11413333333333334, "grad_norm": 0.1831437200307846, "kl": 0.07171630859375, "learning_rate": 2.6111111111111113e-06, "loss": -0.004, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020345225930213928, "mask/share_reasoning": 0.779441237449646, "mask/share_step_conf": 0.19240108132362366, "num_tokens": 31996135.0, "reward": 0.7552859783172607, "reward_std": 0.21182772517204285, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.785930871963501, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.38792234659194946, "step": 107 }, { "adv/mean_abs_final_conf": 0.7637505531311035, "adv/mean_abs_reasoning": 0.30685943365097046, "adv/mean_abs_step_conf": 0.7783085107803345, "adv/ratio_final_to_reasoning": 2.488926424858792, "adv/ratio_step_to_reasoning": 2.536368204555842, "adv/std_final_conf": 0.9282589554786682, "adv/std_reasoning": 0.5959946513175964, "adv/std_step_conf": 0.9353668689727783, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.3125, "calib/ece": 0.1579446640316206, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014256097560975567, "calib/mean_conf": 0.6609486166007905, "calib/mu_c": 0.6582439024390244, "calib/mu_w": 0.6725, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004308300395256917, "calib/std_conf": 0.04583750005322809, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5833650793650794, "calib/step_q_c_n": 2835.0, "calib/step_q_gap": -0.02138763474831029, "calib/step_q_w": 0.6047527141133897, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 815.5625, "completions/mean_terminated_length": 828.5079956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.1152, "grad_norm": 0.1538919061422348, "kl": 0.06288909912109375, "learning_rate": 2.5833333333333337e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01957569271326065, "mask/share_reasoning": 0.7753883600234985, "mask/share_step_conf": 0.18941092491149902, "num_tokens": 32308151.0, "reward": 0.7745775580406189, "reward_std": 0.19146263599395752, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8079085946083069, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.383433997631073, "step": 108 }, { "adv/mean_abs_final_conf": 0.7414380311965942, "adv/mean_abs_reasoning": 0.33757272362709045, "adv/mean_abs_step_conf": 0.7734533548355103, "adv/ratio_final_to_reasoning": 2.196380155452504, "adv/ratio_step_to_reasoning": 2.291219937810876, "adv/std_final_conf": 0.9311279058456421, "adv/std_reasoning": 0.6185241937637329, "adv/std_step_conf": 0.9362562298774719, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 14.640625, "calib/ece": 0.11004048582995961, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.016194331983805668, "calib/gap": -0.017967011128775634, "calib/mean_conf": 0.6606477732793522, "calib/mu_c": 0.6525735294117647, "calib/mu_w": 0.6705405405405404, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11004048582995961, "calib/std_conf": 0.06774255733281026, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.577875150060024, "calib/step_q_c_n": 1666.0, "calib/step_q_gap": -0.048762698162838625, "calib/step_q_w": 0.6266378482228626, "calib/step_q_w_n": 2082.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 815.49609375, "completions/mean_terminated_length": 838.421630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.11626666666666667, "grad_norm": 0.1558358371257782, "kl": 0.06879425048828125, "learning_rate": 2.5555555555555557e-06, "loss": -0.0197, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019493218511343002, "mask/share_reasoning": 0.7717585563659668, "mask/share_step_conf": 0.1814044862985611, "num_tokens": 32621518.0, "reward": 0.5735743641853333, "reward_std": 0.20711404085159302, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7014141082763672, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.14651596546173096, "step": 109 }, { "adv/mean_abs_final_conf": 0.7351251840591431, "adv/mean_abs_reasoning": 0.3662223219871521, "adv/mean_abs_step_conf": 0.7691531181335449, "adv/ratio_final_to_reasoning": 2.0073194339173375, "adv/ratio_step_to_reasoning": 2.1002354907261185, "adv/std_final_conf": 0.9158840179443359, "adv/std_reasoning": 0.6611154079437256, "adv/std_step_conf": 0.9360529184341431, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.3203125, "calib/ece": 0.11093750000000009, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.006303030303030366, "calib/mean_conf": 0.650625, "calib/mu_c": 0.6489999999999999, "calib/mu_w": 0.6553030303030303, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009687500000000012, "calib/std_conf": 0.050003906097424035, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5814681107099879, "calib/step_q_c_n": 2493.0, "calib/step_q_gap": -0.00413712375020836, "calib/step_q_w": 0.5856052344601963, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 742.68359375, "completions/mean_terminated_length": 748.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.11733333333333333, "grad_norm": 0.1851894110441208, "kl": 0.07032012939453125, "learning_rate": 2.5277777777777778e-06, "loss": 0.0597, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021798258647322655, "mask/share_reasoning": 0.7785544395446777, "mask/share_step_conf": 0.19183480739593506, "num_tokens": 32916565.0, "reward": 0.7617905139923096, "reward_std": 0.21278932690620422, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7953585982322693, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.37978485226631165, "step": 110 }, { "adv/mean_abs_final_conf": 0.7544980049133301, "adv/mean_abs_reasoning": 0.3003673851490021, "adv/mean_abs_step_conf": 0.751507043838501, "adv/ratio_final_to_reasoning": 2.5119172127794407, "adv/ratio_step_to_reasoning": 2.501959536870835, "adv/std_final_conf": 0.9297202229499817, "adv/std_reasoning": 0.5959699749946594, "adv/std_step_conf": 0.935775101184845, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.90234375, "calib/ece": 0.19356862745098033, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.040253217449440104, "calib/mean_conf": 0.6593725490196077, "calib/mu_c": 0.6495854922279793, "calib/mu_w": 0.6898387096774194, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04803921568627452, "calib/std_conf": 0.05574098389568812, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5818112449799197, "calib/step_q_c_n": 2490.0, "calib/step_q_gap": -0.02305311423429912, "calib/step_q_w": 0.6048643592142188, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 826.3203125, "completions/mean_terminated_length": 829.5608520507812, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.1184, "grad_norm": 0.17247696220874786, "kl": 0.060150146484375, "learning_rate": 2.5e-06, "loss": 0.0448, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020120007917284966, "mask/share_reasoning": 0.7870587110519409, "mask/share_step_conf": 0.18891501426696777, "num_tokens": 33235511.0, "reward": 0.7798572182655334, "reward_std": 0.18497300148010254, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7854719161987305, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.42424261569976807, "step": 111 }, { "adv/mean_abs_final_conf": 0.748249888420105, "adv/mean_abs_reasoning": 0.37312665581703186, "adv/mean_abs_step_conf": 0.7734876871109009, "adv/ratio_final_to_reasoning": 2.005350935814729, "adv/ratio_step_to_reasoning": 2.072989627120588, "adv/std_final_conf": 0.9300276041030884, "adv/std_reasoning": 0.6611367464065552, "adv/std_step_conf": 0.935664176940918, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.75, "calib/ece": 0.060157480314960605, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.013223016683690858, "calib/mean_conf": 0.6651181102362205, "calib/mu_c": 0.6604848484848485, "calib/mu_w": 0.6737078651685393, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.037834645669291324, "calib/std_conf": 0.04989542042217861, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5863486005089059, "calib/step_q_c_n": 2358.0, "calib/step_q_gap": -0.00951176620477534, "calib/step_q_w": 0.5958603667136813, "calib/step_q_w_n": 1418.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 875.0078125, "completions/mean_terminated_length": 881.8976440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.11946666666666667, "grad_norm": 0.16523876786231995, "kl": 0.0580902099609375, "learning_rate": 2.4722222222222226e-06, "loss": -0.0158, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017950210720300674, "mask/share_reasoning": 0.7930775284767151, "mask/share_step_conf": 0.18115977942943573, "num_tokens": 33567433.0, "reward": 0.714263379573822, "reward_std": 0.2062770277261734, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7576664686203003, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.34351664781570435, "step": 112 }, { "adv/mean_abs_final_conf": 0.7786873579025269, "adv/mean_abs_reasoning": 0.37065157294273376, "adv/mean_abs_step_conf": 0.753793478012085, "adv/ratio_final_to_reasoning": 2.100860794196158, "adv/ratio_step_to_reasoning": 2.0336983114018707, "adv/std_final_conf": 0.9285604953765869, "adv/std_reasoning": 0.640232264995575, "adv/std_step_conf": 0.9355056881904602, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.9765625, "calib/ece": 0.05321568627450977, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.016989413025998434, "calib/mean_conf": 0.6503921568627451, "calib/mu_c": 0.6443292682926829, "calib/mu_w": 0.6613186813186813, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.030235294117647044, "calib/std_conf": 0.054267340566962446, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5788158554942299, "calib/step_q_c_n": 1993.0, "calib/step_q_gap": -0.012749231036996655, "calib/step_q_w": 0.5915650865312265, "calib/step_q_w_n": 1329.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 730.3125, "completions/mean_terminated_length": 733.176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.12053333333333334, "grad_norm": 0.23360399901866913, "kl": 0.07427215576171875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0025, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022047486156225204, "mask/share_reasoning": 0.7772487998008728, "mask/share_step_conf": 0.19679749011993408, "num_tokens": 33859593.0, "reward": 0.6659376621246338, "reward_std": 0.18285523355007172, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7567245960235596, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.24780693650245667, "step": 113 }, { "adv/mean_abs_final_conf": 0.7306428551673889, "adv/mean_abs_reasoning": 0.27716708183288574, "adv/mean_abs_step_conf": 0.7564301490783691, "adv/ratio_final_to_reasoning": 2.636109780193596, "adv/ratio_step_to_reasoning": 2.7291485845871435, "adv/std_final_conf": 0.9272521138191223, "adv/std_reasoning": 0.5726190209388733, "adv/std_step_conf": 0.9354404211044312, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.015625, "calib/ece": 0.18437007874015748, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.036960451977401054, "calib/mean_conf": 0.649251968503937, "calib/mu_c": 0.6406666666666668, "calib/mu_w": 0.6776271186440679, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0329527559055118, "calib/std_conf": 0.050554323865316636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5732784755305326, "calib/step_q_c_n": 2309.0, "calib/step_q_gap": -0.03767949123388581, "calib/step_q_w": 0.6109579667644184, "calib/step_q_w_n": 1023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 745.19140625, "completions/mean_terminated_length": 751.05908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.1216, "grad_norm": 0.15612445771694183, "kl": 0.0630950927734375, "learning_rate": 2.4166666666666667e-06, "loss": -0.0114, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021982520818710327, "mask/share_reasoning": 0.775760293006897, "mask/share_step_conf": 0.1944447159767151, "num_tokens": 34155386.0, "reward": 0.7840208411216736, "reward_std": 0.17199936509132385, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7857136726379395, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.43154671788215637, "step": 114 }, { "adv/mean_abs_final_conf": 0.7011224031448364, "adv/mean_abs_reasoning": 0.2878670394420624, "adv/mean_abs_step_conf": 0.7257644534111023, "adv/ratio_final_to_reasoning": 2.435577218231502, "adv/ratio_step_to_reasoning": 2.5211794126127227, "adv/std_final_conf": 0.9287891983985901, "adv/std_reasoning": 0.618270218372345, "adv/std_step_conf": 0.9357025623321533, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.66015625, "calib/ece": 0.012755905511811032, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0004261003138983366, "calib/mean_conf": 0.6466141732283465, "calib/mu_c": 0.6467701863354037, "calib/mu_w": 0.6463440860215054, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012755905511811032, "calib/std_conf": 0.03710424710798357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5752891744933267, "calib/step_q_c_n": 2023.0, "calib/step_q_gap": -0.006090135851500866, "calib/step_q_w": 0.5813793103448276, "calib/step_q_w_n": 1218.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 743.84765625, "completions/mean_terminated_length": 749.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.12266666666666666, "grad_norm": 0.20659328997135162, "kl": 0.06292724609375, "learning_rate": 2.388888888888889e-06, "loss": -0.0252, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020182587206363678, "mask/share_reasoning": 0.7855570316314697, "mask/share_step_conf": 0.1864479035139084, "num_tokens": 34451075.0, "reward": 0.657963752746582, "reward_std": 0.1874353587627411, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7605875134468079, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.2311212718486786, "step": 115 }, { "adv/mean_abs_final_conf": 0.7466577291488647, "adv/mean_abs_reasoning": 0.3631676137447357, "adv/mean_abs_step_conf": 0.7654522657394409, "adv/ratio_final_to_reasoning": 2.055959014213414, "adv/ratio_step_to_reasoning": 2.1077107009807987, "adv/std_final_conf": 0.9302747249603271, "adv/std_reasoning": 0.640287458896637, "adv/std_step_conf": 0.935708224773407, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.53515625, "calib/ece": 0.08755905511811027, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.003269878574226337, "calib/mean_conf": 0.6592125984251969, "calib/mu_c": 0.6583243243243243, "calib/mu_w": 0.6615942028985506, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0092125984251969, "calib/std_conf": 0.05507484240991102, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5850990699555196, "calib/step_q_c_n": 2473.0, "calib/step_q_gap": -0.004890849399319053, "calib/step_q_w": 0.5899899193548387, "calib/step_q_w_n": 992.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 839.34765625, "completions/mean_terminated_length": 849.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.12373333333333333, "grad_norm": 0.1914065182209015, "kl": 0.0562591552734375, "learning_rate": 2.361111111111111e-06, "loss": -0.0423, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01951763778924942, "mask/share_reasoning": 0.7906308174133301, "mask/share_step_conf": 0.1781328320503235, "num_tokens": 34770468.0, "reward": 0.7317217588424683, "reward_std": 0.22887194156646729, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7868398427963257, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.33363497257232666, "step": 116 }, { "adv/mean_abs_final_conf": 0.7924519777297974, "adv/mean_abs_reasoning": 0.33524632453918457, "adv/mean_abs_step_conf": 0.7852796316146851, "adv/ratio_final_to_reasoning": 2.363790203573651, "adv/ratio_step_to_reasoning": 2.3423959463063384, "adv/std_final_conf": 0.9294283390045166, "adv/std_reasoning": 0.5961469411849976, "adv/std_step_conf": 0.935607373714447, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.48828125, "calib/ece": 0.07921568627450978, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": -0.018871558474362282, "calib/mean_conf": 0.6596078431372548, "calib/mu_c": 0.6516891891891892, "calib/mu_w": 0.6705607476635514, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07921568627450978, "calib/std_conf": 0.06135594588422498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5792249730893434, "calib/step_q_c_n": 1858.0, "calib/step_q_gap": -0.018856531612850946, "calib/step_q_w": 0.5980815047021943, "calib/step_q_w_n": 1595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 792.65625, "completions/mean_terminated_length": 798.8976440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.1248, "grad_norm": 0.1906462460756302, "kl": 0.05759429931640625, "learning_rate": 2.3333333333333336e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01992100104689598, "mask/share_reasoning": 0.7847388982772827, "mask/share_step_conf": 0.18752756714820862, "num_tokens": 35079988.0, "reward": 0.6975091695785522, "reward_std": 0.19168932735919952, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7343515753746033, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3458230495452881, "step": 117 }, { "adv/mean_abs_final_conf": 0.751489520072937, "adv/mean_abs_reasoning": 0.23559094965457916, "adv/mean_abs_step_conf": 0.7602692246437073, "adv/ratio_final_to_reasoning": 3.1898064045956036, "adv/ratio_step_to_reasoning": 3.227073135697299, "adv/std_final_conf": 0.9276825189590454, "adv/std_reasoning": 0.5226951241493225, "adv/std_step_conf": 0.9354988932609558, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.7734375, "calib/ece": 0.19570312500000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.0464843909406244, "calib/mean_conf": 0.6641406249999999, "calib/mu_c": 0.6483431952662722, "calib/mu_w": 0.6948275862068966, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09984375000000002, "calib/std_conf": 0.06216821112601982, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5820426977195535, "calib/step_q_c_n": 2061.0, "calib/step_q_gap": -0.02276958896986636, "calib/step_q_w": 0.6048122866894199, "calib/step_q_w_n": 1465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 854.04296875, "completions/mean_terminated_length": 860.7677001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.12586666666666665, "grad_norm": 0.1517084538936615, "kl": 0.05948638916015625, "learning_rate": 2.305555555555556e-06, "loss": -0.0313, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018919751048088074, "mask/share_reasoning": 0.7929970026016235, "mask/share_step_conf": 0.1802707016468048, "num_tokens": 35402631.0, "reward": 0.7126184105873108, "reward_std": 0.18107867240905762, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7509117126464844, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.34229379892349243, "step": 118 }, { "adv/mean_abs_final_conf": 0.7432209849357605, "adv/mean_abs_reasoning": 0.3506610691547394, "adv/mean_abs_step_conf": 0.7597742080688477, "adv/ratio_final_to_reasoning": 2.119485310209308, "adv/ratio_step_to_reasoning": 2.1666910726653126, "adv/std_final_conf": 0.9295665621757507, "adv/std_reasoning": 0.6611495614051819, "adv/std_step_conf": 0.9353212118148804, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.15625, "calib/ece": 0.15254901960784312, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.01568627450980392, "calib/gap": -0.03208333333333324, "calib/mean_conf": 0.6621960784313726, "calib/mu_c": 0.65125, "calib/mu_w": 0.6833333333333332, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0779607843137255, "calib/std_conf": 0.06584504078809585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5848651162790698, "calib/step_q_c_n": 2150.0, "calib/step_q_gap": -0.03094899498280268, "calib/step_q_w": 0.6158141112618725, "calib/step_q_w_n": 1474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 848.875, "completions/mean_terminated_length": 855.55908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.12693333333333334, "grad_norm": 0.16920378804206848, "kl": 0.05426788330078125, "learning_rate": 2.277777777777778e-06, "loss": 0.0294, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019031208008527756, "mask/share_reasoning": 0.7919583320617676, "mask/share_step_conf": 0.18119795620441437, "num_tokens": 35725007.0, "reward": 0.7125355005264282, "reward_std": 0.19247612357139587, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7510000467300415, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3443836271762848, "step": 119 }, { "adv/mean_abs_final_conf": 0.7433023452758789, "adv/mean_abs_reasoning": 0.2874475121498108, "adv/mean_abs_step_conf": 0.766741156578064, "adv/ratio_final_to_reasoning": 2.5858715551814813, "adv/ratio_step_to_reasoning": 2.667412742046822, "adv/std_final_conf": 0.926930844783783, "adv/std_reasoning": 0.5958617925643921, "adv/std_step_conf": 0.9352979063987732, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.84375, "calib/ece": 0.14555118110236218, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.021628289473684337, "calib/mean_conf": 0.6541338582677165, "calib/mu_c": 0.6486842105263158, "calib/mu_w": 0.6703125000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025826771653543287, "calib/std_conf": 0.043406132264746156, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5776756066411239, "calib/step_q_c_n": 2349.0, "calib/step_q_gap": -0.009310548843434185, "calib/step_q_w": 0.5869861554845581, "calib/step_q_w_n": 939.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 796.8125, "completions/mean_terminated_length": 803.0866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.128, "grad_norm": 0.16395922005176544, "kl": 0.0589599609375, "learning_rate": 2.25e-06, "loss": -0.0356, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019536158069968224, "mask/share_reasoning": 0.7872902154922485, "mask/share_step_conf": 0.1853610873222351, "num_tokens": 36035679.0, "reward": 0.7953190207481384, "reward_std": 0.16985073685646057, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7864730358123779, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.45728999376296997, "step": 120 }, { "adv/mean_abs_final_conf": 0.7642772793769836, "adv/mean_abs_reasoning": 0.3697505593299866, "adv/mean_abs_step_conf": 0.741147518157959, "adv/ratio_final_to_reasoning": 2.0670077707574173, "adv/ratio_step_to_reasoning": 2.0044527302432464, "adv/std_final_conf": 0.9294301271438599, "adv/std_reasoning": 0.6402899622917175, "adv/std_step_conf": 0.9361931085586548, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.98828125, "calib/ece": 0.13168627450980389, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.017973352033660683, "calib/mean_conf": 0.6667450980392158, "calib/mu_c": 0.6618817204301075, "calib/mu_w": 0.6798550724637682, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03450980392156862, "calib/std_conf": 0.05298162579658359, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5841450777202072, "calib/step_q_c_n": 2509.0, "calib/step_q_gap": -0.017767235712628615, "calib/step_q_w": 0.6019123134328358, "calib/step_q_w_n": 1072.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 884.28125, "completions/mean_terminated_length": 891.2440795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.12906666666666666, "grad_norm": 0.18738946318626404, "kl": 0.05516815185546875, "learning_rate": 2.222222222222222e-06, "loss": -0.0092, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017908845096826553, "mask/share_reasoning": 0.7953507900238037, "mask/share_step_conf": 0.17892783880233765, "num_tokens": 36367111.0, "reward": 0.7506467700004578, "reward_std": 0.22363224625587463, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7857195138931274, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.37104272842407227, "step": 121 }, { "adv/mean_abs_final_conf": 0.7539645433425903, "adv/mean_abs_reasoning": 0.3049893379211426, "adv/mean_abs_step_conf": 0.7673645615577698, "adv/ratio_final_to_reasoning": 2.4721013150221465, "adv/ratio_step_to_reasoning": 2.516037336872996, "adv/std_final_conf": 0.9275853037834167, "adv/std_reasoning": 0.5960739850997925, "adv/std_step_conf": 0.9354673027992249, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.76171875, "calib/ece": 0.09349397590361447, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007713487629688598, "calib/mean_conf": 0.6608835341365462, "calib/mu_c": 0.6587150837988827, "calib/mu_w": 0.6664285714285713, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01775100401606426, "calib/std_conf": 0.044143141684649084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.579853268119164, "calib/step_q_c_n": 2249.0, "calib/step_q_gap": -0.028584722461683643, "calib/step_q_w": 0.6084379905808477, "calib/step_q_w_n": 1274.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 832.37890625, "completions/mean_terminated_length": 855.7791137695312, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.13013333333333332, "grad_norm": 0.20258116722106934, "kl": 0.0553741455078125, "learning_rate": 2.1944444444444445e-06, "loss": -0.152, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01726049929857254, "mask/share_reasoning": 0.7798194289207458, "mask/share_step_conf": 0.1755763292312622, "num_tokens": 36687544.0, "reward": 0.7311630249023438, "reward_std": 0.16484180092811584, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7678898572921753, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3600612282752991, "step": 122 }, { "adv/mean_abs_final_conf": 0.7517590522766113, "adv/mean_abs_reasoning": 0.42011213302612305, "adv/mean_abs_step_conf": 0.7539490461349487, "adv/ratio_final_to_reasoning": 1.7894247587223722, "adv/ratio_step_to_reasoning": 1.7946376380614253, "adv/std_final_conf": 0.9308037757873535, "adv/std_reasoning": 0.701386034488678, "adv/std_step_conf": 0.9355675578117371, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.4375, "calib/ece": 0.03579999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.012561142774691447, "calib/mean_conf": 0.66932, "calib/mu_c": 0.6651497005988025, "calib/mu_w": 0.677710843373494, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01856000000000001, "calib/std_conf": 0.05720085314049084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5876273400087071, "calib/step_q_c_n": 2297.0, "calib/step_q_gap": -0.01649703454454532, "calib/step_q_w": 0.6041243745532524, "calib/step_q_w_n": 1399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 932.6171875, "completions/mean_terminated_length": 955.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.1312, "grad_norm": 0.1839117556810379, "kl": 0.0541534423828125, "learning_rate": 2.166666666666667e-06, "loss": -0.1264, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016855834051966667, "mask/share_reasoning": 0.7892988920211792, "mask/share_step_conf": 0.17040777206420898, "num_tokens": 37031582.0, "reward": 0.7302656173706055, "reward_std": 0.21496935188770294, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7513464689254761, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3834035396575928, "step": 123 }, { "adv/mean_abs_final_conf": 0.7521032094955444, "adv/mean_abs_reasoning": 0.32858970761299133, "adv/mean_abs_step_conf": 0.756147563457489, "adv/ratio_final_to_reasoning": 2.288882433229959, "adv/ratio_step_to_reasoning": 2.301190651863234, "adv/std_final_conf": 0.9297972321510315, "adv/std_reasoning": 0.6184530854225159, "adv/std_step_conf": 0.9357353448867798, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.43359375, "calib/ece": 0.07372549019607842, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.003072407045009773, "calib/mean_conf": 0.6594509803921569, "calib/mu_c": 0.6585714285714286, "calib/mu_w": 0.6616438356164384, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009725490196078435, "calib/std_conf": 0.04873792006210367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5836648558295028, "calib/step_q_c_n": 2393.0, "calib/step_q_gap": -0.004709905164761108, "calib/step_q_w": 0.5883747609942639, "calib/step_q_w_n": 1046.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 884.359375, "completions/mean_terminated_length": 887.8275146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.13226666666666667, "grad_norm": 0.18207412958145142, "kl": 0.050323486328125, "learning_rate": 2.138888888888889e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017641527578234673, "mask/share_reasoning": 0.8057651519775391, "mask/share_step_conf": 0.1726871132850647, "num_tokens": 37364794.0, "reward": 0.7761149406433105, "reward_std": 0.20994237065315247, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.786019504070282, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.42480406165122986, "step": 124 }, { "adv/mean_abs_final_conf": 0.7799336910247803, "adv/mean_abs_reasoning": 0.4838736057281494, "adv/mean_abs_step_conf": 0.7964974045753479, "adv/ratio_final_to_reasoning": 1.611854173883094, "adv/ratio_step_to_reasoning": 1.646085661929734, "adv/std_final_conf": 0.9312655329704285, "adv/std_reasoning": 0.7206544280052185, "adv/std_step_conf": 0.9362965822219849, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.15625, "calib/ece": 0.05118110236220468, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004841906559697962, "calib/mean_conf": 0.6553543307086616, "calib/mu_c": 0.6536196319018406, "calib/mu_w": 0.6584615384615385, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03240157480314956, "calib/std_conf": 0.051164321900868295, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5798667301285102, "calib/step_q_c_n": 2101.0, "calib/step_q_gap": -0.01134873948474946, "calib/step_q_w": 0.5912154696132597, "calib/step_q_w_n": 1267.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 860.84375, "completions/mean_terminated_length": 874.5079956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.13333333333333333, "grad_norm": 0.1974727362394333, "kl": 0.05866241455078125, "learning_rate": 2.1111111111111114e-06, "loss": -0.024, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01837814599275589, "mask/share_reasoning": 0.7949548959732056, "mask/share_step_conf": 0.17104199528694153, "num_tokens": 37689978.0, "reward": 0.6941776275634766, "reward_std": 0.24238252639770508, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.759081244468689, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3034926950931549, "step": 125 }, { "adv/mean_abs_final_conf": 0.7817037105560303, "adv/mean_abs_reasoning": 0.3521580100059509, "adv/mean_abs_step_conf": 0.7613109946250916, "adv/ratio_final_to_reasoning": 2.219752748326868, "adv/ratio_step_to_reasoning": 2.1618448906280068, "adv/std_final_conf": 0.9274759292602539, "adv/std_reasoning": 0.6187031865119934, "adv/std_step_conf": 0.9351398348808289, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.015625, "calib/ece": 0.15972222222222232, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03895386614684848, "calib/mean_conf": 0.6592460317460317, "calib/mu_c": 0.6467251461988305, "calib/mu_w": 0.685679012345679, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0701984126984127, "calib/std_conf": 0.046766761108402846, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5762054604349838, "calib/step_q_c_n": 2161.0, "calib/step_q_gap": -0.031425934099003516, "calib/step_q_w": 0.6076313945339873, "calib/step_q_w_n": 1427.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 861.546875, "completions/mean_terminated_length": 875.2222900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.1344, "grad_norm": 0.17764291167259216, "kl": 0.053813934326171875, "learning_rate": 2.0833333333333334e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018441149964928627, "mask/share_reasoning": 0.7864916920661926, "mask/share_step_conf": 0.1794421523809433, "num_tokens": 38015998.0, "reward": 0.7014974355697632, "reward_std": 0.21991148591041565, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7504230737686157, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.32210302352905273, "step": 126 }, { "adv/mean_abs_final_conf": 0.7877411842346191, "adv/mean_abs_reasoning": 0.3967254161834717, "adv/mean_abs_step_conf": 0.7814566493034363, "adv/ratio_final_to_reasoning": 1.985608060639897, "adv/ratio_step_to_reasoning": 1.969767041449242, "adv/std_final_conf": 0.9294294714927673, "adv/std_reasoning": 0.6613271832466125, "adv/std_step_conf": 0.9356931447982788, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.62109375, "calib/ece": 0.040963855421686735, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014924885241340835, "calib/mean_conf": 0.6660240963855422, "calib/mu_c": 0.6605696202531646, "calib/mu_w": 0.6754945054945054, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03622489959839359, "calib/std_conf": 0.0469282908233953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5867464114832537, "calib/step_q_c_n": 2090.0, "calib/step_q_gap": -0.022769619974701505, "calib/step_q_w": 0.6095160314579552, "calib/step_q_w_n": 1653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 843.4765625, "completions/mean_terminated_length": 863.7200317382812, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.13546666666666668, "grad_norm": 0.20979590713977814, "kl": 0.06250762939453125, "learning_rate": 2.0555555555555555e-06, "loss": -0.0176, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018223129212856293, "mask/share_reasoning": 0.7735735774040222, "mask/share_step_conf": 0.18476581573486328, "num_tokens": 38335600.0, "reward": 0.6690424680709839, "reward_std": 0.22137640416622162, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7372585535049438, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.28285762667655945, "step": 127 }, { "adv/mean_abs_final_conf": 0.7571334838867188, "adv/mean_abs_reasoning": 0.4741972088813782, "adv/mean_abs_step_conf": 0.768513560295105, "adv/ratio_final_to_reasoning": 1.5966637291534922, "adv/ratio_step_to_reasoning": 1.6206623444874617, "adv/std_final_conf": 0.9315160512924194, "adv/std_reasoning": 0.7392508387565613, "adv/std_step_conf": 0.9360662698745728, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.11328125, "calib/ece": 0.10455999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.026448308202871473, "calib/mean_conf": 0.6692, "calib/mu_c": 0.6604191616766466, "calib/mu_w": 0.6868674698795181, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05288, "calib/std_conf": 0.061788024729716035, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5889560439560441, "calib/step_q_c_n": 2184.0, "calib/step_q_gap": -0.013213305239197393, "calib/step_q_w": 0.6021693491952415, "calib/step_q_w_n": 1429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 896.26171875, "completions/mean_terminated_length": 921.4578247070312, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.13653333333333334, "grad_norm": 0.20551764965057373, "kl": 0.05535888671875, "learning_rate": 2.027777777777778e-06, "loss": -0.0794, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017915409058332443, "mask/share_reasoning": 0.778226375579834, "mask/share_step_conf": 0.17651450634002686, "num_tokens": 38671707.0, "reward": 0.6882283091545105, "reward_std": 0.2571631073951721, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7447984218597412, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.30587688088417053, "step": 128 }, { "adv/mean_abs_final_conf": 0.7544215321540833, "adv/mean_abs_reasoning": 0.33456581830978394, "adv/mean_abs_step_conf": 0.7508994340896606, "adv/ratio_final_to_reasoning": 2.254927105122087, "adv/ratio_step_to_reasoning": 2.244399735403877, "adv/std_final_conf": 0.9279298186302185, "adv/std_reasoning": 0.6184731721878052, "adv/std_step_conf": 0.9357430338859558, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.95703125, "calib/ece": 0.060222656250000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.0004651147143914791, "calib/mean_conf": 0.6550898437499999, "calib/mu_c": 0.6549281437125748, "calib/mu_w": 0.6553932584269663, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.031484374999999995, "calib/std_conf": 0.06971137297350115, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5895831784386617, "calib/step_q_c_n": 2152.0, "calib/step_q_gap": 0.0024415475373741335, "calib/step_q_w": 0.5871416309012876, "calib/step_q_w_n": 1165.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 811.8515625, "completions/mean_terminated_length": 818.2440795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.1376, "grad_norm": 0.19056949019432068, "kl": 0.05794525146484375, "learning_rate": 2.0000000000000003e-06, "loss": 0.057, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020177476108074188, "mask/share_reasoning": 0.7892353534698486, "mask/share_step_conf": 0.1827746033668518, "num_tokens": 38981925.0, "reward": 0.734876275062561, "reward_std": 0.20278804004192352, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7681304216384888, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.37115341424942017, "step": 129 }, { "adv/mean_abs_final_conf": 0.7722935676574707, "adv/mean_abs_reasoning": 0.13021612167358398, "adv/mean_abs_step_conf": 0.7471880912780762, "adv/ratio_final_to_reasoning": 5.930859848470978, "adv/ratio_step_to_reasoning": 5.738061322015651, "adv/std_final_conf": 0.9245293736457825, "adv/std_reasoning": 0.36968645453453064, "adv/std_step_conf": 0.9349712133407593, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.92578125, "calib/ece": 0.16717647058823515, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03459047764849976, "calib/mean_conf": 0.6572941176470589, "calib/mu_c": 0.6476630434782609, "calib/mu_w": 0.6822535211267606, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05145098039215685, "calib/std_conf": 0.044652599826664925, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5745742205677059, "calib/step_q_c_n": 2149.0, "calib/step_q_gap": -0.02470164150125953, "calib/step_q_w": 0.5992758620689654, "calib/step_q_w_n": 1160.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 777.61328125, "completions/mean_terminated_length": 783.7362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.13866666666666666, "grad_norm": 0.1187925636768341, "kl": 0.05173492431640625, "learning_rate": 1.9722222222222224e-06, "loss": -0.0046, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020093917846679688, "mask/share_reasoning": 0.7792434096336365, "mask/share_step_conf": 0.19285017251968384, "num_tokens": 39286282.0, "reward": 0.7898828387260437, "reward_std": 0.12785649299621582, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7760253548622131, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.46077150106430054, "step": 130 }, { "adv/mean_abs_final_conf": 0.7504079937934875, "adv/mean_abs_reasoning": 0.3435615301132202, "adv/mean_abs_step_conf": 0.7561184167861938, "adv/ratio_final_to_reasoning": 2.1842026188036585, "adv/ratio_step_to_reasoning": 2.2008238714532915, "adv/std_final_conf": 0.9309230446815491, "adv/std_reasoning": 0.6185750365257263, "adv/std_step_conf": 0.9353523850440979, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.64453125, "calib/ece": 0.219763779527559, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.019634054325955552, "calib/mean_conf": 0.6607086614173228, "calib/mu_c": 0.6497321428571429, "calib/mu_w": 0.6693661971830984, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.219763779527559, "calib/std_conf": 0.05030898390931509, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5784447572132302, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": -0.015160455141982165, "calib/step_q_w": 0.5936052123552124, "calib/step_q_w_n": 2072.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 814.8046875, "completions/mean_terminated_length": 821.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.13973333333333332, "grad_norm": 0.1584021896123886, "kl": 0.0573883056640625, "learning_rate": 1.944444444444445e-06, "loss": -0.0395, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0187082439661026, "mask/share_reasoning": 0.7880691885948181, "mask/share_step_conf": 0.1854100525379181, "num_tokens": 39601080.0, "reward": 0.5554240942001343, "reward_std": 0.20415136218070984, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6875663995742798, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.13734431564807892, "step": 131 }, { "adv/mean_abs_final_conf": 0.72148597240448, "adv/mean_abs_reasoning": 0.3808108866214752, "adv/mean_abs_step_conf": 0.7539358139038086, "adv/ratio_final_to_reasoning": 1.8946043764805356, "adv/ratio_step_to_reasoning": 1.9798168602601383, "adv/std_final_conf": 0.9284760355949402, "adv/std_reasoning": 0.6814852356910706, "adv/std_step_conf": 0.9356449246406555, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.1640625, "calib/ece": 0.14043137254901958, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0017073170731704002, "calib/mean_conf": 0.6653725490196077, "calib/mu_c": 0.6657073170731705, "calib/mu_w": 0.6640000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0009411764705882361, "calib/std_conf": 0.051317527787378664, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5865436746987952, "calib/step_q_c_n": 2656.0, "calib/step_q_gap": -0.0016776138166110233, "calib/step_q_w": 0.5882212885154062, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 823.26171875, "completions/mean_terminated_length": 829.7440795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.1408, "grad_norm": 0.22911450266838074, "kl": 0.0549163818359375, "learning_rate": 1.916666666666667e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019517943263053894, "mask/share_reasoning": 0.7815057039260864, "mask/share_step_conf": 0.1911638379096985, "num_tokens": 39917427.0, "reward": 0.798399806022644, "reward_std": 0.22343795001506805, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8178699016571045, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4195547103881836, "step": 132 }, { "adv/mean_abs_final_conf": 0.7569185495376587, "adv/mean_abs_reasoning": 0.38095924258232117, "adv/mean_abs_step_conf": 0.7612114548683167, "adv/ratio_final_to_reasoning": 1.9868754053764603, "adv/ratio_step_to_reasoning": 1.998144078900585, "adv/std_final_conf": 0.9316318035125732, "adv/std_reasoning": 0.6611606478691101, "adv/std_step_conf": 0.9360270500183105, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 14.84765625, "calib/ece": 0.09964843749999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.004250904245193254, "calib/mean_conf": 0.6838671875000001, "calib/mu_c": 0.682156862745098, "calib/mu_w": 0.6864077669902913, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0929296875, "calib/std_conf": 0.057915330317972316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5956744604316547, "calib/step_q_c_n": 2224.0, "calib/step_q_gap": -0.002511969498592559, "calib/step_q_w": 0.5981864299302473, "calib/step_q_w_n": 1577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 996.5234375, "completions/mean_terminated_length": 1004.3700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.14186666666666667, "grad_norm": 0.1763768196105957, "kl": 0.04608917236328125, "learning_rate": 1.888888888888889e-06, "loss": 0.0136, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.015577636659145355, "mask/share_reasoning": 0.8027868270874023, "mask/share_step_conf": 0.17382307350635529, "num_tokens": 40278881.0, "reward": 0.6586554050445557, "reward_std": 0.2522858679294586, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7467058897018433, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.2510736882686615, "step": 133 }, { "adv/mean_abs_final_conf": 0.7732973098754883, "adv/mean_abs_reasoning": 0.3910679221153259, "adv/mean_abs_step_conf": 0.7673285007476807, "adv/ratio_final_to_reasoning": 1.977398978910479, "adv/ratio_step_to_reasoning": 1.9621361337875098, "adv/std_final_conf": 0.9308413863182068, "adv/std_reasoning": 0.6612193584442139, "adv/std_step_conf": 0.9361827969551086, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 15.3203125, "calib/ece": 0.08171874999999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.009156626506024002, "calib/mean_conf": 0.6840625, "calib/mu_c": 0.680843373493976, "calib/mu_w": 0.69, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05867187499999997, "calib/std_conf": 0.060135751377612304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.599133252328878, "calib/step_q_c_n": 2469.0, "calib/step_q_gap": 0.0017210018127046256, "calib/step_q_w": 0.5974122505161734, "calib/step_q_w_n": 1453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 985.46875, "completions/mean_terminated_length": 993.2283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.14293333333333333, "grad_norm": 0.21439006924629211, "kl": 0.04900360107421875, "learning_rate": 1.8611111111111113e-06, "loss": -0.009, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.016259362921118736, "mask/share_reasoning": 0.796444833278656, "mask/share_step_conf": 0.1794833391904831, "num_tokens": 40640113.0, "reward": 0.6827579140663147, "reward_std": 0.2516036629676819, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.762973427772522, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.27285486459732056, "step": 134 }, { "adv/mean_abs_final_conf": 0.7396765351295471, "adv/mean_abs_reasoning": 0.3181406259536743, "adv/mean_abs_step_conf": 0.7739882469177246, "adv/ratio_final_to_reasoning": 2.324998679160373, "adv/ratio_step_to_reasoning": 2.4328494501372737, "adv/std_final_conf": 0.929745614528656, "adv/std_reasoning": 0.6185531616210938, "adv/std_step_conf": 0.935878574848175, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.94921875, "calib/ece": 0.07687747035573127, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.000853842669485072, "calib/mean_conf": 0.6714229249011858, "calib/mu_c": 0.6711731843575419, "calib/mu_w": 0.672027027027027, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.020395256916996046, "calib/std_conf": 0.054202363724677044, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5869275603663614, "calib/step_q_c_n": 2402.0, "calib/step_q_gap": -0.011207597888557408, "calib/step_q_w": 0.5981351582549188, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 904.71484375, "completions/mean_terminated_length": 915.4427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.144, "grad_norm": 0.149041548371315, "kl": 0.05022430419921875, "learning_rate": 1.8333333333333333e-06, "loss": -0.0673, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.017318157479166985, "mask/share_reasoning": 0.7906175851821899, "mask/share_step_conf": 0.1803455352783203, "num_tokens": 40977600.0, "reward": 0.7551214694976807, "reward_std": 0.20652639865875244, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7792269587516785, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3935159742832184, "step": 135 }, { "adv/mean_abs_final_conf": 0.7377204895019531, "adv/mean_abs_reasoning": 0.4371660351753235, "adv/mean_abs_step_conf": 0.7849434614181519, "adv/ratio_final_to_reasoning": 1.6875064166549298, "adv/ratio_step_to_reasoning": 1.7955270955653126, "adv/std_final_conf": 0.9307897090911865, "adv/std_reasoning": 0.7205690145492554, "adv/std_step_conf": 0.9362493753433228, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.6484375, "calib/ece": 0.12819999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02222403764459846, "calib/mean_conf": 0.6714, "calib/mu_c": 0.6618881118881118, "calib/mu_w": 0.6841121495327103, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11379999999999994, "calib/std_conf": 0.051660816873138976, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5871563483735571, "calib/step_q_c_n": 1906.0, "calib/step_q_gap": -0.01733389023815657, "calib/step_q_w": 0.6044902386117137, "calib/step_q_w_n": 1844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 855.8671875, "completions/mean_terminated_length": 876.4080200195312, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.14506666666666668, "grad_norm": 0.21199779212474823, "kl": 0.05721282958984375, "learning_rate": 1.8055555555555557e-06, "loss": -0.1456, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018238719552755356, "mask/share_reasoning": 0.7721179723739624, "mask/share_step_conf": 0.18620575964450836, "num_tokens": 41305190.0, "reward": 0.6414843201637268, "reward_std": 0.22870557010173798, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7146027088165283, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.26133468747138977, "step": 136 }, { "adv/mean_abs_final_conf": 0.7521461248397827, "adv/mean_abs_reasoning": 0.2468286156654358, "adv/mean_abs_step_conf": 0.7700768709182739, "adv/ratio_final_to_reasoning": 3.047240380990834, "adv/ratio_step_to_reasoning": 3.11988490006392, "adv/std_final_conf": 0.9292258620262146, "adv/std_reasoning": 0.5483195185661316, "adv/std_step_conf": 0.9359169602394104, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 15.21484375, "calib/ece": 0.07782608695652168, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": 0.006240981240981269, "calib/mean_conf": 0.6781422924901186, "calib/mu_c": 0.6805844155844156, "calib/mu_w": 0.6743434343434344, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07363636363636357, "calib/std_conf": 0.05971695019170884, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.597585152838428, "calib/step_q_c_n": 2290.0, "calib/step_q_gap": -0.00749584404630721, "calib/step_q_w": 0.6050809968847352, "calib/step_q_w_n": 1605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 930.125, "completions/mean_terminated_length": 937.4487915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.14613333333333334, "grad_norm": 0.3232787549495697, "kl": 0.0700531005859375, "learning_rate": 1.777777777777778e-06, "loss": -0.0455, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.016633665189146996, "mask/share_reasoning": 0.793748676776886, "mask/share_step_conf": 0.18180516362190247, "num_tokens": 41650286.0, "reward": 0.7173147201538086, "reward_std": 0.18486090004444122, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7475347518920898, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3691259026527405, "step": 137 }, { "adv/mean_abs_final_conf": 0.7692033052444458, "adv/mean_abs_reasoning": 0.26124483346939087, "adv/mean_abs_step_conf": 0.7495217323303223, "adv/ratio_final_to_reasoning": 2.944377100320993, "adv/ratio_step_to_reasoning": 2.8690394461643622, "adv/std_final_conf": 0.9280076622962952, "adv/std_reasoning": 0.5482669472694397, "adv/std_step_conf": 0.935155987739563, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.046875, "calib/ece": 0.05815686274509814, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.023529411764705882, "calib/gap": 0.019387522768670284, "calib/mean_conf": 0.6696078431372549, "calib/mu_c": 0.6750819672131148, "calib/mu_w": 0.6556944444444445, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.005058823529411749, "calib/std_conf": 0.07222407355743857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6015701515740381, "calib/step_q_c_n": 2573.0, "calib/step_q_gap": 0.0005046579278993324, "calib/step_q_w": 0.6010654936461388, "calib/step_q_w_n": 1023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 862.86328125, "completions/mean_terminated_length": 866.2471313476562, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.1472, "grad_norm": 0.157817080616951, "kl": 0.058624267578125, "learning_rate": 1.75e-06, "loss": 0.0231, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019831445068120956, "mask/share_reasoning": 0.7864515781402588, "mask/share_step_conf": 0.18981072306632996, "num_tokens": 41975515.0, "reward": 0.7384357452392578, "reward_std": 0.19949795305728912, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7945871353149414, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.34009692072868347, "step": 138 }, { "adv/mean_abs_final_conf": 0.7381882071495056, "adv/mean_abs_reasoning": 0.3053765892982483, "adv/mean_abs_step_conf": 0.7476175427436829, "adv/ratio_final_to_reasoning": 2.417304511933456, "adv/ratio_step_to_reasoning": 2.448182241021484, "adv/std_final_conf": 0.9278108477592468, "adv/std_reasoning": 0.5961070656776428, "adv/std_step_conf": 0.9348329901695251, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.234375, "calib/ece": 0.10700787401574796, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007726075504828844, "calib/mean_conf": 0.6596850393700787, "calib/mu_c": 0.6576470588235295, "calib/mu_w": 0.6653731343283583, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.015236220472440934, "calib/std_conf": 0.04211788869154169, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583914309484193, "calib/step_q_c_n": 2404.0, "calib/step_q_gap": -0.00866699132881521, "calib/step_q_w": 0.5925813008130082, "calib/step_q_w_n": 984.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 805.84765625, "completions/mean_terminated_length": 812.1929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.14826666666666666, "grad_norm": 0.19154296815395355, "kl": 0.05483245849609375, "learning_rate": 1.7222222222222224e-06, "loss": -0.0483, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019182808697223663, "mask/share_reasoning": 0.7837511301040649, "mask/share_step_conf": 0.18925350904464722, "num_tokens": 42284908.0, "reward": 0.7721434831619263, "reward_std": 0.18229490518569946, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7868027687072754, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.4137341380119324, "step": 139 }, { "adv/mean_abs_final_conf": 0.7774754166603088, "adv/mean_abs_reasoning": 0.32294416427612305, "adv/mean_abs_step_conf": 0.7844225168228149, "adv/ratio_final_to_reasoning": 2.407460801786012, "adv/ratio_step_to_reasoning": 2.428972570478529, "adv/std_final_conf": 0.9279150366783142, "adv/std_reasoning": 0.5960806608200073, "adv/std_step_conf": 0.9359557628631592, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.31640625, "calib/ece": 0.12937254901960785, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02518849206349194, "calib/mean_conf": 0.6697647058823528, "calib/mu_c": 0.6635416666666667, "calib/mu_w": 0.6887301587301586, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02309803921568629, "calib/std_conf": 0.0580596678326277, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5849558638083229, "calib/step_q_c_n": 2379.0, "calib/step_q_gap": -0.017180058521774177, "calib/step_q_w": 0.602135922330097, "calib/step_q_w_n": 1030.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 872.99609375, "completions/mean_terminated_length": 879.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.14933333333333335, "grad_norm": 0.16991457343101501, "kl": 0.0538330078125, "learning_rate": 1.6944444444444446e-06, "loss": 0.0077, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01823488250374794, "mask/share_reasoning": 0.790435791015625, "mask/share_step_conf": 0.18351686000823975, "num_tokens": 42613411.0, "reward": 0.7748196721076965, "reward_std": 0.20988968014717102, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7912160158157349, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4092045724391937, "step": 140 }, { "adv/mean_abs_final_conf": 0.7478249073028564, "adv/mean_abs_reasoning": 0.30096516013145447, "adv/mean_abs_step_conf": 0.7793318033218384, "adv/ratio_final_to_reasoning": 2.4847557337740493, "adv/ratio_step_to_reasoning": 2.5894419240467723, "adv/std_final_conf": 0.9278432726860046, "adv/std_reasoning": 0.5961489677429199, "adv/std_step_conf": 0.9352448582649231, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 15.125, "calib/ece": 0.10784000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.00848232848232855, "calib/mean_conf": 0.6748, "calib/mu_c": 0.6725945945945946, "calib/mu_w": 0.6810769230769231, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02132000000000002, "calib/std_conf": 0.06133970981346424, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5923554356206631, "calib/step_q_c_n": 2594.0, "calib/step_q_gap": -0.02778540944975949, "calib/step_q_w": 0.6201408450704226, "calib/step_q_w_n": 1278.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 946.01953125, "completions/mean_terminated_length": 964.8645629882812, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.1504, "grad_norm": 0.1529771089553833, "kl": 0.04474639892578125, "learning_rate": 1.6666666666666667e-06, "loss": -0.0675, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016732092946767807, "mask/share_reasoning": 0.7889929413795471, "mask/share_step_conf": 0.17474375665187836, "num_tokens": 42962688.0, "reward": 0.7419514656066895, "reward_std": 0.19250836968421936, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7776585817337036, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3664005696773529, "step": 141 }, { "adv/mean_abs_final_conf": 0.7602142095565796, "adv/mean_abs_reasoning": 0.3405728042125702, "adv/mean_abs_step_conf": 0.7700763940811157, "adv/ratio_final_to_reasoning": 2.2321635789864422, "adv/ratio_step_to_reasoning": 2.261121218594039, "adv/std_final_conf": 0.9295332431793213, "adv/std_reasoning": 0.6185359358787537, "adv/std_step_conf": 0.936034083366394, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.9453125, "calib/ece": 0.10357142857142856, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.011647058823529344, "calib/mean_conf": 0.6673809523809524, "calib/mu_c": 0.6626666666666666, "calib/mu_w": 0.674313725490196, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08785714285714286, "calib/std_conf": 0.05888081277783693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5899589953869809, "calib/step_q_c_n": 1951.0, "calib/step_q_gap": -0.013444340005236421, "calib/step_q_w": 0.6034033353922174, "calib/step_q_w_n": 1619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 902.99609375, "completions/mean_terminated_length": 920.9840698242188, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.15146666666666667, "grad_norm": 0.1717846393585205, "kl": 0.04624176025390625, "learning_rate": 1.638888888888889e-06, "loss": -0.0152, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01820656657218933, "mask/share_reasoning": 0.7857927680015564, "mask/share_step_conf": 0.17646938562393188, "num_tokens": 43299015.0, "reward": 0.6462733745574951, "reward_std": 0.21243321895599365, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7331492304801941, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.24533499777317047, "step": 142 }, { "adv/mean_abs_final_conf": 0.75215083360672, "adv/mean_abs_reasoning": 0.325761616230011, "adv/mean_abs_step_conf": 0.7678713202476501, "adv/ratio_final_to_reasoning": 2.3088995023761414, "adv/ratio_step_to_reasoning": 2.3571571418821735, "adv/std_final_conf": 0.9295081496238708, "adv/std_reasoning": 0.618578314781189, "adv/std_step_conf": 0.935473620891571, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.97265625, "calib/ece": 0.16708661417322834, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.03146934988938843, "calib/mean_conf": 0.6662204724409448, "calib/mu_c": 0.6561849710982659, "calib/mu_w": 0.6876543209876543, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07610236220472441, "calib/std_conf": 0.054439580521306874, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5799955015744489, "calib/step_q_c_n": 2223.0, "calib/step_q_gap": -0.026939505811075515, "calib/step_q_w": 0.6069350073855244, "calib/step_q_w_n": 1354.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 894.27734375, "completions/mean_terminated_length": 904.8814697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.15253333333333333, "grad_norm": 0.4571000337600708, "kl": 0.05872344970703125, "learning_rate": 1.6111111111111113e-06, "loss": -0.0381, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.017354609444737434, "mask/share_reasoning": 0.795860767364502, "mask/share_step_conf": 0.17506583034992218, "num_tokens": 43635286.0, "reward": 0.7234030365943909, "reward_std": 0.22833162546157837, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7575539350509644, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.35643959045410156, "step": 143 }, { "adv/mean_abs_final_conf": 0.7369581460952759, "adv/mean_abs_reasoning": 0.28275996446609497, "adv/mean_abs_step_conf": 0.71214759349823, "adv/ratio_final_to_reasoning": 2.6063030085846637, "adv/ratio_step_to_reasoning": 2.518558788345094, "adv/std_final_conf": 0.9275577664375305, "adv/std_reasoning": 0.5959152579307556, "adv/std_step_conf": 0.9351029396057129, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.6796875, "calib/ece": 0.14607843137254897, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.006871091871091584, "calib/mean_conf": 0.658392156862745, "calib/mu_c": 0.6566137566137568, "calib/mu_w": 0.6634848484848483, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0316470588235294, "calib/std_conf": 0.05160334326886597, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5831010752688173, "calib/step_q_c_n": 2325.0, "calib/step_q_gap": -0.006204027879934126, "calib/step_q_w": 0.5893051031487514, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 842.84375, "completions/mean_terminated_length": 849.4802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.1536, "grad_norm": 0.17217902839183807, "kl": 0.05643463134765625, "learning_rate": 1.5833333333333333e-06, "loss": -0.0481, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019316058605909348, "mask/share_reasoning": 0.7909666299819946, "mask/share_step_conf": 0.18190476298332214, "num_tokens": 43955182.0, "reward": 0.7673730850219727, "reward_std": 0.15579378604888916, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7929043173789978, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3949667513370514, "step": 144 }, { "adv/mean_abs_final_conf": 0.719872236251831, "adv/mean_abs_reasoning": 0.396770179271698, "adv/mean_abs_step_conf": 0.736086368560791, "adv/ratio_final_to_reasoning": 1.8143304962414555, "adv/ratio_step_to_reasoning": 1.855195795994381, "adv/std_final_conf": 0.9305780529975891, "adv/std_reasoning": 0.7011988162994385, "adv/std_step_conf": 0.9361504316329956, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.390625, "calib/ece": 0.07204724409448815, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.011173909940769278, "calib/mean_conf": 0.674488188976378, "calib/mu_c": 0.6709248554913294, "calib/mu_w": 0.6820987654320987, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03271653543307088, "calib/std_conf": 0.06198435999606487, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5919049604001667, "calib/step_q_c_n": 2399.0, "calib/step_q_gap": -0.01571371664263488, "calib/step_q_w": 0.6076186770428016, "calib/step_q_w_n": 1285.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 879.0859375, "completions/mean_terminated_length": 889.5098876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.15466666666666667, "grad_norm": 0.17100566625595093, "kl": 0.05695343017578125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0269, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018207937479019165, "mask/share_reasoning": 0.7837401032447815, "mask/share_step_conf": 0.18633320927619934, "num_tokens": 44282932.0, "reward": 0.6938064098358154, "reward_std": 0.24391242861747742, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7680108547210693, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.28600814938545227, "step": 145 }, { "adv/mean_abs_final_conf": 0.7830526828765869, "adv/mean_abs_reasoning": 0.31210857629776, "adv/mean_abs_step_conf": 0.7571125626564026, "adv/ratio_final_to_reasoning": 2.50891113651915, "adv/ratio_step_to_reasoning": 2.4257986487820724, "adv/std_final_conf": 0.9304502606391907, "adv/std_reasoning": 0.5727735757827759, "adv/std_step_conf": 0.9361012578010559, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.39453125, "calib/ece": 0.08645669291338594, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.009376797698945394, "calib/mean_conf": 0.6730708661417322, "calib/mu_c": 0.6691946308724832, "calib/mu_w": 0.6785714285714286, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08645669291338594, "calib/std_conf": 0.06035996758325317, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5853272532188841, "calib/step_q_c_n": 1864.0, "calib/step_q_gap": -0.01265357745204243, "calib/step_q_w": 0.5979808306709266, "calib/step_q_w_n": 1565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 899.69921875, "completions/mean_terminated_length": 910.3676147460938, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.15573333333333333, "grad_norm": 0.15870602428913116, "kl": 0.05303192138671875, "learning_rate": 1.527777777777778e-06, "loss": -0.0074, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01740969344973564, "mask/share_reasoning": 0.7895663380622864, "mask/share_step_conf": 0.1813051998615265, "num_tokens": 44620471.0, "reward": 0.6407595872879028, "reward_std": 0.2192278802394867, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7360406517982483, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.230634868144989, "step": 146 }, { "adv/mean_abs_final_conf": 0.7577475309371948, "adv/mean_abs_reasoning": 0.26077109575271606, "adv/mean_abs_step_conf": 0.7750769257545471, "adv/ratio_final_to_reasoning": 2.905795708492752, "adv/ratio_step_to_reasoning": 2.972250139599585, "adv/std_final_conf": 0.9285784959793091, "adv/std_reasoning": 0.548322319984436, "adv/std_step_conf": 0.9357059001922607, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 14.48828125, "calib/ece": 0.06952000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.004, "calib/gap": -0.014648556876061036, "calib/mean_conf": 0.67376, "calib/mu_c": 0.6681935483870968, "calib/mu_w": 0.6828421052631578, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06164000000000003, "calib/std_conf": 0.05776731255649686, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5893780369290573, "calib/step_q_c_n": 2058.0, "calib/step_q_gap": -0.020761272580330892, "calib/step_q_w": 0.6101393095093882, "calib/step_q_w_n": 1651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 899.05078125, "completions/mean_terminated_length": 916.960205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 275.0, "epoch": 0.1568, "grad_norm": 0.15729469060897827, "kl": 0.0526580810546875, "learning_rate": 1.5e-06, "loss": -0.0615, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017730841413140297, "mask/share_reasoning": 0.7814549207687378, "mask/share_step_conf": 0.18128295242786407, "num_tokens": 44954308.0, "reward": 0.6083645820617676, "reward_std": 0.19842839241027832, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.733662486076355, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.1666603982448578, "step": 147 }, { "adv/mean_abs_final_conf": 0.7911767363548279, "adv/mean_abs_reasoning": 0.23125512897968292, "adv/mean_abs_step_conf": 0.7521365880966187, "adv/ratio_final_to_reasoning": 3.4212289251510493, "adv/ratio_step_to_reasoning": 3.252410406701501, "adv/std_final_conf": 0.9272419810295105, "adv/std_reasoning": 0.49597135186195374, "adv/std_step_conf": 0.9352139234542847, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.7734375, "calib/ece": 0.1997265625000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03117959949937421, "calib/mean_conf": 0.6663671875, "calib/mu_c": 0.6580851063829787, "calib/mu_w": 0.6892647058823529, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06585937500000005, "calib/std_conf": 0.058340376655793405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5856148208469055, "calib/step_q_c_n": 2456.0, "calib/step_q_gap": -0.012441253919449613, "calib/step_q_w": 0.5980560747663551, "calib/step_q_w_n": 1070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 830.12890625, "completions/mean_terminated_length": 836.6653442382812, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.15786666666666666, "grad_norm": 0.1504538506269455, "kl": 0.0512542724609375, "learning_rate": 1.4722222222222225e-06, "loss": 0.0055, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02039681002497673, "mask/share_reasoning": 0.7797145247459412, "mask/share_step_conf": 0.192076176404953, "num_tokens": 45271933.0, "reward": 0.7831298112869263, "reward_std": 0.1646493375301361, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7847386598587036, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.43464604020118713, "step": 148 }, { "adv/mean_abs_final_conf": 0.7535303831100464, "adv/mean_abs_reasoning": 0.2745121121406555, "adv/mean_abs_step_conf": 0.7678822875022888, "adv/ratio_final_to_reasoning": 2.7449804572701324, "adv/ratio_step_to_reasoning": 2.797261955089393, "adv/std_final_conf": 0.9277353882789612, "adv/std_reasoning": 0.5726152062416077, "adv/std_step_conf": 0.9349270462989807, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.31640625, "calib/ece": 0.09976377952755902, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.023622047244094488, "calib/gap": -0.027439775910364173, "calib/mean_conf": 0.674015748031496, "calib/mu_c": 0.6649411764705881, "calib/mu_w": 0.6923809523809523, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05224409448818898, "calib/std_conf": 0.07041787356455628, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5885290208241029, "calib/step_q_c_n": 2257.0, "calib/step_q_gap": -0.023530638266806236, "calib/step_q_w": 0.6120596590909091, "calib/step_q_w_n": 1408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 945.51171875, "completions/mean_terminated_length": 952.9566650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.15893333333333334, "grad_norm": 0.1864592432975769, "kl": 0.0453643798828125, "learning_rate": 1.4444444444444445e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.017488349229097366, "mask/share_reasoning": 0.7987071871757507, "mask/share_step_conf": 0.175991952419281, "num_tokens": 45618440.0, "reward": 0.7075504660606384, "reward_std": 0.17754337191581726, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7555820345878601, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3282688856124878, "step": 149 }, { "adv/mean_abs_final_conf": 0.7613473534584045, "adv/mean_abs_reasoning": 0.17415973544120789, "adv/mean_abs_step_conf": 0.7704967260360718, "adv/ratio_final_to_reasoning": 4.371546336641152, "adv/ratio_step_to_reasoning": 4.424080710068447, "adv/std_final_conf": 0.924812912940979, "adv/std_reasoning": 0.4675443768501282, "adv/std_step_conf": 0.9354718327522278, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.4375, "calib/ece": 0.09329411764705892, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": -0.01556890103567321, "calib/mean_conf": 0.6520392156862744, "calib/mu_c": 0.6472159090909091, "calib/mu_w": 0.6627848101265823, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027568627450980397, "calib/std_conf": 0.05854241324377105, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5801556603773586, "calib/step_q_c_n": 2120.0, "calib/step_q_gap": -0.010671407291814372, "calib/step_q_w": 0.5908270676691729, "calib/step_q_w_n": 1064.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 804.04296875, "completions/mean_terminated_length": 810.3740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 250.0, "epoch": 0.16, "grad_norm": 0.11460676789283752, "kl": 0.055023193359375, "learning_rate": 1.4166666666666667e-06, "loss": -0.0341, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02103450521826744, "mask/share_reasoning": 0.7882153987884521, "mask/share_step_conf": 0.1829375922679901, "num_tokens": 45929235.0, "reward": 0.7221521735191345, "reward_std": 0.14214414358139038, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7698402404785156, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.33852654695510864, "step": 150 }, { "adv/mean_abs_final_conf": 0.768987774848938, "adv/mean_abs_reasoning": 0.28910064697265625, "adv/mean_abs_step_conf": 0.7840760946273804, "adv/ratio_final_to_reasoning": 2.6599310063864037, "adv/ratio_step_to_reasoning": 2.7121215494946296, "adv/std_final_conf": 0.9297264218330383, "adv/std_reasoning": 0.5726152062416077, "adv/std_step_conf": 0.9357607960700989, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.69140625, "calib/ece": 0.0995256916996047, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007418688693856423, "calib/mean_conf": 0.6748616600790515, "calib/mu_c": 0.6718120805369127, "calib/mu_w": 0.6792307692307691, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09272727272727267, "calib/std_conf": 0.06690563699177939, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5945520934761441, "calib/step_q_c_n": 2054.0, "calib/step_q_gap": -0.00992945309679083, "calib/step_q_w": 0.604481546572935, "calib/step_q_w_n": 1707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 924.71875, "completions/mean_terminated_length": 939.3968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.16106666666666666, "grad_norm": 0.19589243829250336, "kl": 0.0519866943359375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0981, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017413543537259102, "mask/share_reasoning": 0.7887399196624756, "mask/share_step_conf": 0.17822150886058807, "num_tokens": 46272987.0, "reward": 0.6662525534629822, "reward_std": 0.18551214039325714, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7337561845779419, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2846863567829132, "step": 151 }, { "adv/mean_abs_final_conf": 0.7441304922103882, "adv/mean_abs_reasoning": 0.45768579840660095, "adv/mean_abs_step_conf": 0.7441684007644653, "adv/ratio_final_to_reasoning": 1.6258544503697145, "adv/ratio_step_to_reasoning": 1.6259372769599412, "adv/std_final_conf": 0.9302271008491516, "adv/std_reasoning": 0.7392387986183167, "adv/std_step_conf": 0.9360255599021912, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.0703125, "calib/ece": 0.12772908366533864, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00796812749003984, "calib/gap": -0.03468060052987931, "calib/mean_conf": 0.6623107569721116, "calib/mu_c": 0.6513953488372093, "calib/mu_w": 0.6860759493670886, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05239043824701195, "calib/std_conf": 0.06678322429797894, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5809782082324455, "calib/step_q_c_n": 2065.0, "calib/step_q_gap": -0.02828799005014626, "calib/step_q_w": 0.6092661982825918, "calib/step_q_w_n": 1281.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 864.37890625, "completions/mean_terminated_length": 878.0992431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.16213333333333332, "grad_norm": 0.21392974257469177, "kl": 0.05400848388671875, "learning_rate": 1.3611111111111112e-06, "loss": -0.0955, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019230522215366364, "mask/share_reasoning": 0.7847087383270264, "mask/share_step_conf": 0.18043574690818787, "num_tokens": 46599660.0, "reward": 0.7458244562149048, "reward_std": 0.2385040670633316, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7494453191757202, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.41173478960990906, "step": 152 }, { "adv/mean_abs_final_conf": 0.7392740249633789, "adv/mean_abs_reasoning": 0.22879235446453094, "adv/mean_abs_step_conf": 0.7788934707641602, "adv/ratio_final_to_reasoning": 3.2312007396120683, "adv/ratio_step_to_reasoning": 3.4043684396145757, "adv/std_final_conf": 0.9284136891365051, "adv/std_reasoning": 0.5227508544921875, "adv/std_step_conf": 0.9354473948478699, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.27734375, "calib/ece": 0.10626984126984128, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.031333333333333435, "calib/mean_conf": 0.6703968253968254, "calib/mu_c": 0.6614444444444444, "calib/mu_w": 0.6927777777777778, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03119047619047619, "calib/std_conf": 0.05615879969846835, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.582473164448261, "calib/step_q_c_n": 2329.0, "calib/step_q_gap": -0.038959716396384514, "calib/step_q_w": 0.6214328808446455, "calib/step_q_w_n": 1326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 937.6171875, "completions/mean_terminated_length": 956.2948608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.1632, "grad_norm": 0.12315616756677628, "kl": 0.0469512939453125, "learning_rate": 1.3333333333333334e-06, "loss": -0.0565, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.016671668738126755, "mask/share_reasoning": 0.7928116321563721, "mask/share_step_conf": 0.17098543047904968, "num_tokens": 46947010.0, "reward": 0.7480127811431885, "reward_std": 0.17629170417785645, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7658921480178833, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3926334083080292, "step": 153 }, { "adv/mean_abs_final_conf": 0.7574979066848755, "adv/mean_abs_reasoning": 0.3248511850833893, "adv/mean_abs_step_conf": 0.7860252857208252, "adv/ratio_final_to_reasoning": 2.3318305164576385, "adv/ratio_step_to_reasoning": 2.419647277934518, "adv/std_final_conf": 0.9292215704917908, "adv/std_reasoning": 0.6185224056243896, "adv/std_step_conf": 0.9357133507728577, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.3125, "calib/ece": 0.0781746031746032, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.00029931972789110084, "calib/mean_conf": 0.6615079365079365, "calib/mu_c": 0.6616326530612244, "calib/mu_w": 0.6613333333333333, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0781746031746032, "calib/std_conf": 0.042380878069015165, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5797038233710287, "calib/step_q_c_n": 1857.0, "calib/step_q_gap": -0.019696563476166773, "calib/step_q_w": 0.5994003868471954, "calib/step_q_w_n": 1551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 851.70703125, "completions/mean_terminated_length": 865.2262573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.16426666666666667, "grad_norm": 0.16613580286502838, "kl": 0.05309295654296875, "learning_rate": 1.3055555555555556e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017889706417918205, "mask/share_reasoning": 0.7869839668273926, "mask/share_step_conf": 0.17950135469436646, "num_tokens": 47269487.0, "reward": 0.633529007434845, "reward_std": 0.22433821856975555, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7374765276908875, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.21786263585090637, "step": 154 }, { "adv/mean_abs_final_conf": 0.7702876925468445, "adv/mean_abs_reasoning": 0.42663636803627014, "adv/mean_abs_step_conf": 0.7740648984909058, "adv/ratio_final_to_reasoning": 1.8054899916112146, "adv/ratio_step_to_reasoning": 1.8143434467478385, "adv/std_final_conf": 0.9317903518676758, "adv/std_reasoning": 0.7012701034545898, "adv/std_step_conf": 0.9358920454978943, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.796875, "calib/ece": 0.08015624999999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014206773618538282, "calib/mean_conf": 0.664296875, "calib/mu_c": 0.6586363636363637, "calib/mu_w": 0.672843137254902, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07144531249999997, "calib/std_conf": 0.054085921136968494, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5821870286576171, "calib/step_q_c_n": 1989.0, "calib/step_q_gap": -0.010321072444132828, "calib/step_q_w": 0.5925081011017499, "calib/step_q_w_n": 1543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 835.6328125, "completions/mean_terminated_length": 842.2125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.16533333333333333, "grad_norm": 0.20822681486606598, "kl": 0.0546417236328125, "learning_rate": 1.2777777777777779e-06, "loss": -0.0081, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018709741532802582, "mask/share_reasoning": 0.788835883140564, "mask/share_step_conf": 0.18464188277721405, "num_tokens": 47590625.0, "reward": 0.6713146567344666, "reward_std": 0.2506943941116333, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7466437816619873, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.2756730318069458, "step": 155 }, { "adv/mean_abs_final_conf": 0.7726517915725708, "adv/mean_abs_reasoning": 0.24603010714054108, "adv/mean_abs_step_conf": 0.7691492438316345, "adv/ratio_final_to_reasoning": 3.140476588628256, "adv/ratio_step_to_reasoning": 3.1262403320105427, "adv/std_final_conf": 0.9284862875938416, "adv/std_reasoning": 0.5227888226509094, "adv/std_step_conf": 0.9357995390892029, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.28515625, "calib/ece": 0.10900398406374504, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.021861842105263096, "calib/mean_conf": 0.6622310756972111, "calib/mu_c": 0.6552631578947369, "calib/mu_w": 0.677125, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0449800796812749, "calib/std_conf": 0.05450065234484988, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804718779790278, "calib/step_q_c_n": 2098.0, "calib/step_q_gap": -0.017632496541309828, "calib/step_q_w": 0.5981043745203376, "calib/step_q_w_n": 1303.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 847.90625, "completions/mean_terminated_length": 868.2560424804688, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.1664, "grad_norm": 0.18850506842136383, "kl": 0.05255126953125, "learning_rate": 1.25e-06, "loss": -0.0312, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018288226798176765, "mask/share_reasoning": 0.7817035913467407, "mask/share_step_conf": 0.17657069861888885, "num_tokens": 47912449.0, "reward": 0.717280387878418, "reward_std": 0.18664982914924622, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7549937963485718, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3498794436454773, "step": 156 }, { "adv/mean_abs_final_conf": 0.765222430229187, "adv/mean_abs_reasoning": 0.3630412220954895, "adv/mean_abs_step_conf": 0.7838761210441589, "adv/ratio_final_to_reasoning": 2.1078114099888996, "adv/ratio_step_to_reasoning": 2.1591931531069455, "adv/std_final_conf": 0.928774356842041, "adv/std_reasoning": 0.6402490735054016, "adv/std_step_conf": 0.9357649683952332, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.8828125, "calib/ece": 0.06405511811023627, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.007035714285714478, "calib/mean_conf": 0.6668110236220472, "calib/mu_c": 0.6687500000000001, "calib/mu_w": 0.6617142857142856, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0032283464566929135, "calib/std_conf": 0.0634666794943521, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5885262689225289, "calib/step_q_c_n": 2246.0, "calib/step_q_gap": -0.01572278050712894, "calib/step_q_w": 0.6042490494296578, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 837.375, "completions/mean_terminated_length": 847.3043823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.16746666666666668, "grad_norm": 0.200291246175766, "kl": 0.05327606201171875, "learning_rate": 1.2222222222222223e-06, "loss": -0.0404, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0200999416410923, "mask/share_reasoning": 0.7814955711364746, "mask/share_step_conf": 0.18668577075004578, "num_tokens": 48230545.0, "reward": 0.7479537129402161, "reward_std": 0.22217577695846558, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7896058559417725, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.36411404609680176, "step": 157 }, { "adv/mean_abs_final_conf": 0.7526696920394897, "adv/mean_abs_reasoning": 0.3639039099216461, "adv/mean_abs_step_conf": 0.7706518769264221, "adv/ratio_final_to_reasoning": 2.0683198820302606, "adv/ratio_step_to_reasoning": 2.117734533526597, "adv/std_final_conf": 0.9300745129585266, "adv/std_reasoning": 0.6402790546417236, "adv/std_step_conf": 0.9360171556472778, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.05859375, "calib/ece": 0.18531496062992123, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.028512839059674278, "calib/mean_conf": 0.6668110236220472, "calib/mu_c": 0.6579428571428573, "calib/mu_w": 0.6864556962025316, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0815748031496063, "calib/std_conf": 0.06711480729909547, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804814636494945, "calib/step_q_c_n": 2077.0, "calib/step_q_gap": -0.02657224883075826, "calib/step_q_w": 0.6070537124802527, "calib/step_q_w_n": 1266.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 878.85546875, "completions/mean_terminated_length": 885.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.16853333333333334, "grad_norm": 0.1852419376373291, "kl": 0.05022430419921875, "learning_rate": 1.1944444444444446e-06, "loss": 0.0119, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019303487613797188, "mask/share_reasoning": 0.789633572101593, "mask/share_step_conf": 0.18325044214725494, "num_tokens": 48560772.0, "reward": 0.6829037666320801, "reward_std": 0.21487480401992798, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7624925374984741, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.26815879344940186, "step": 158 }, { "adv/mean_abs_final_conf": 0.7331850528717041, "adv/mean_abs_reasoning": 0.38067251443862915, "adv/mean_abs_step_conf": 0.7472409009933472, "adv/ratio_final_to_reasoning": 1.9260257178087017, "adv/ratio_step_to_reasoning": 1.9629494451294698, "adv/std_final_conf": 0.9297038912773132, "adv/std_reasoning": 0.6815901398658752, "adv/std_step_conf": 0.9354227185249329, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 12.9921875, "calib/ece": 0.14879999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007530248033877807, "calib/mean_conf": 0.65568, "calib/mu_c": 0.6533908045977012, "calib/mu_w": 0.660921052631579, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05423999999999999, "calib/std_conf": 0.05591902717322612, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5796999531176746, "calib/step_q_c_n": 2133.0, "calib/step_q_gap": -0.024432486111160223, "calib/step_q_w": 0.6041324392288349, "calib/step_q_w_n": 1193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 792.640625, "completions/mean_terminated_length": 811.6640625, "completions/min_length": 0.0, "completions/min_terminated_length": 374.0, "epoch": 0.1696, "grad_norm": 0.22308699786663055, "kl": 0.0548553466796875, "learning_rate": 1.1666666666666668e-06, "loss": -0.0441, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01981358602643013, "mask/share_reasoning": 0.7754086256027222, "mask/share_step_conf": 0.1813403069972992, "num_tokens": 48868472.0, "reward": 0.7687622308731079, "reward_std": 0.2001192569732666, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7621843814849854, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.4440900683403015, "step": 159 }, { "adv/mean_abs_final_conf": 0.762536883354187, "adv/mean_abs_reasoning": 0.4082333445549011, "adv/mean_abs_step_conf": 0.7621127367019653, "adv/ratio_final_to_reasoning": 1.86789465761447, "adv/ratio_step_to_reasoning": 1.8668556767034812, "adv/std_final_conf": 0.9294963479042053, "adv/std_reasoning": 0.68171626329422, "adv/std_step_conf": 0.9357032775878906, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 13.76171875, "calib/ece": 0.08971774193548387, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.020982956215104265, "calib/mean_conf": 0.6575403225806451, "calib/mu_c": 0.6506024096385542, "calib/mu_w": 0.6715853658536585, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03895161290322579, "calib/std_conf": 0.0517515621072553, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.579351145038168, "calib/step_q_c_n": 2096.0, "calib/step_q_gap": -0.02983596077823003, "calib/step_q_w": 0.609187105816398, "calib/step_q_w_n": 1427.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 804.671875, "completions/mean_terminated_length": 830.6290283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.17066666666666666, "grad_norm": 0.18031994998455048, "kl": 0.053371429443359375, "learning_rate": 1.138888888888889e-06, "loss": -0.1049, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018375877290964127, "mask/share_reasoning": 0.772445797920227, "mask/share_step_conf": 0.17792832851409912, "num_tokens": 49179308.0, "reward": 0.7028266191482544, "reward_std": 0.22729066014289856, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7426198720932007, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.33959585428237915, "step": 160 }, { "adv/mean_abs_final_conf": 0.747994065284729, "adv/mean_abs_reasoning": 0.27806806564331055, "adv/mean_abs_step_conf": 0.7542315125465393, "adv/ratio_final_to_reasoning": 2.6899675212766505, "adv/ratio_step_to_reasoning": 2.7123988898244193, "adv/std_final_conf": 0.9273564219474792, "adv/std_reasoning": 0.5725544691085815, "adv/std_step_conf": 0.9354497194290161, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.4296875, "calib/ece": 0.11035156249999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.003320174356443606, "calib/mean_conf": 0.6501953125, "calib/mu_c": 0.649378238341969, "calib/mu_w": 0.6526984126984126, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0033203125, "calib/std_conf": 0.04322408157529023, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5743053763440861, "calib/step_q_c_n": 2325.0, "calib/step_q_gap": -0.009486922372366702, "calib/step_q_w": 0.5837922987164528, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 776.5078125, "completions/mean_terminated_length": 782.6220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.17173333333333332, "grad_norm": 0.14907941222190857, "kl": 0.05158233642578125, "learning_rate": 1.111111111111111e-06, "loss": -0.0043, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020560087636113167, "mask/share_reasoning": 0.7882505655288696, "mask/share_step_conf": 0.18337681889533997, "num_tokens": 49482014.0, "reward": 0.7526839375495911, "reward_std": 0.1831292361021042, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.8006120920181274, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.3539745807647705, "step": 161 }, { "adv/mean_abs_final_conf": 0.7359354496002197, "adv/mean_abs_reasoning": 0.27248501777648926, "adv/mean_abs_step_conf": 0.748088002204895, "adv/ratio_final_to_reasoning": 2.700829042292094, "adv/ratio_step_to_reasoning": 2.7454280176920687, "adv/std_final_conf": 0.9246222972869873, "adv/std_reasoning": 0.5725796818733215, "adv/std_step_conf": 0.9342425465583801, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.2265625, "calib/ece": 0.15486166007905147, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.009047290640394223, "calib/mean_conf": 0.6549407114624506, "calib/mu_c": 0.6531527093596058, "calib/mu_w": 0.6622, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0037154150197628456, "calib/std_conf": 0.05366872882087633, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5769044585987262, "calib/step_q_c_n": 2355.0, "calib/step_q_gap": -0.04254070269159649, "calib/step_q_w": 0.6194451612903227, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 780.484375, "completions/mean_terminated_length": 792.873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.1728, "grad_norm": 0.17687027156352997, "kl": 0.05267333984375, "learning_rate": 1.0833333333333335e-06, "loss": -0.0147, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020576588809490204, "mask/share_reasoning": 0.7769348621368408, "mask/share_step_conf": 0.18686355650424957, "num_tokens": 49785962.0, "reward": 0.8413803577423096, "reward_std": 0.1559084951877594, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.8044047355651855, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.5221060514450073, "step": 162 }, { "adv/mean_abs_final_conf": 0.7620823383331299, "adv/mean_abs_reasoning": 0.3502871096134186, "adv/mean_abs_step_conf": 0.7656612992286682, "adv/ratio_final_to_reasoning": 2.1755934415462046, "adv/ratio_step_to_reasoning": 2.1858106627836293, "adv/std_final_conf": 0.930777370929718, "adv/std_reasoning": 0.6402459740638733, "adv/std_step_conf": 0.9347506761550903, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.125, "calib/ece": 0.07142292490118575, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.011347168734845314, "calib/mean_conf": 0.6695256916996047, "calib/mu_c": 0.6658479532163742, "calib/mu_w": 0.6771951219512196, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03252964426877467, "calib/std_conf": 0.057378263743452985, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5843631436314363, "calib/step_q_c_n": 2214.0, "calib/step_q_gap": -0.02153557248839255, "calib/step_q_w": 0.6058987161198288, "calib/step_q_w_n": 1402.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 904.625, "completions/mean_terminated_length": 915.351806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.17386666666666667, "grad_norm": 0.16011030972003937, "kl": 0.05437469482421875, "learning_rate": 1.0555555555555557e-06, "loss": -0.0748, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01856684312224388, "mask/share_reasoning": 0.7873039245605469, "mask/share_step_conf": 0.18241044878959656, "num_tokens": 50122378.0, "reward": 0.7258908748626709, "reward_std": 0.1946534365415573, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7635785341262817, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3569532632827759, "step": 163 }, { "adv/mean_abs_final_conf": 0.7561768293380737, "adv/mean_abs_reasoning": 0.3443371057510376, "adv/mean_abs_step_conf": 0.7469892501831055, "adv/ratio_final_to_reasoning": 2.1960364326370456, "adv/ratio_step_to_reasoning": 2.1693545008860973, "adv/std_final_conf": 0.9291502237319946, "adv/std_reasoning": 0.6402323842048645, "adv/std_step_conf": 0.9354658722877502, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.19140625, "calib/ece": 0.14956521739130432, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03309090909090917, "calib/mean_conf": 0.6747826086956522, "calib/mu_c": 0.6632727272727272, "calib/mu_w": 0.6963636363636364, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08608695652173909, "calib/std_conf": 0.05579865850025347, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5821397797989469, "calib/step_q_c_n": 2089.0, "calib/step_q_gap": -0.023320064760638526, "calib/step_q_w": 0.6054598445595855, "calib/step_q_w_n": 1544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 943.09765625, "completions/mean_terminated_length": 958.0675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.17493333333333333, "grad_norm": 0.1804104745388031, "kl": 0.05059051513671875, "learning_rate": 1.0277777777777777e-06, "loss": -0.0569, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01607145369052887, "mask/share_reasoning": 0.7968579530715942, "mask/share_step_conf": 0.1714455485343933, "num_tokens": 50469947.0, "reward": 0.6703404188156128, "reward_std": 0.2007838785648346, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7456773519515991, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.26844096183776855, "step": 164 }, { "adv/mean_abs_final_conf": 0.7765575647354126, "adv/mean_abs_reasoning": 0.42560875415802, "adv/mean_abs_step_conf": 0.7829569578170776, "adv/ratio_final_to_reasoning": 1.8245808084273856, "adv/ratio_step_to_reasoning": 1.8396166671101444, "adv/std_final_conf": 0.9294382929801941, "adv/std_reasoning": 0.6816999912261963, "adv/std_step_conf": 0.9358112812042236, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.54296875, "calib/ece": 0.10681102362204728, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.016648547207661957, "calib/mean_conf": 0.6614566929133859, "calib/mu_c": 0.6548366013071896, "calib/mu_w": 0.6714851485148515, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08295275590551185, "calib/std_conf": 0.05583866747974536, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804133961276817, "calib/step_q_c_n": 1911.0, "calib/step_q_gap": -0.019663724694940377, "calib/step_q_w": 0.6000771208226221, "calib/step_q_w_n": 1556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 900.40234375, "completions/mean_terminated_length": 907.4921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.176, "grad_norm": 0.18720674514770508, "kl": 0.0489501953125, "learning_rate": 1.0000000000000002e-06, "loss": -0.0342, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017843585461378098, "mask/share_reasoning": 0.8022754788398743, "mask/share_step_conf": 0.17206846177577972, "num_tokens": 50806026.0, "reward": 0.6247101426124573, "reward_std": 0.2394401729106903, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7400652170181274, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.19138632714748383, "step": 165 }, { "adv/mean_abs_final_conf": 0.7800743579864502, "adv/mean_abs_reasoning": 0.32665368914604187, "adv/mean_abs_step_conf": 0.7688565850257874, "adv/ratio_final_to_reasoning": 2.388077599937011, "adv/ratio_step_to_reasoning": 2.3537361143410913, "adv/std_final_conf": 0.9293928742408752, "adv/std_reasoning": 0.6185818314552307, "adv/std_step_conf": 0.9357060790061951, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.0390625, "calib/ece": 0.1617391304347827, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": -0.037626811594203, "calib/mean_conf": 0.6672727272727274, "calib/mu_c": 0.6570108695652174, "calib/mu_w": 0.6946376811594204, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05086956521739133, "calib/std_conf": 0.06375032145367865, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5857695729537367, "calib/step_q_c_n": 2248.0, "calib/step_q_gap": -0.031221252734336735, "calib/step_q_w": 0.6169908256880734, "calib/step_q_w_n": 1090.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 868.63671875, "completions/mean_terminated_length": 878.936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.17706666666666668, "grad_norm": 0.16540831327438354, "kl": 0.0525665283203125, "learning_rate": 9.722222222222224e-07, "loss": -0.0487, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01886698603630066, "mask/share_reasoning": 0.7834525108337402, "mask/share_step_conf": 0.1859617829322815, "num_tokens": 51134581.0, "reward": 0.8182835578918457, "reward_std": 0.2058083713054657, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7699328064918518, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.5252280235290527, "step": 166 }, { "adv/mean_abs_final_conf": 0.7705814838409424, "adv/mean_abs_reasoning": 0.2326258420944214, "adv/mean_abs_step_conf": 0.7521966695785522, "adv/ratio_final_to_reasoning": 3.3125360316940546, "adv/ratio_step_to_reasoning": 3.2335043381519077, "adv/std_final_conf": 0.926654577255249, "adv/std_reasoning": 0.5227459669113159, "adv/std_step_conf": 0.935153067111969, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.265625, "calib/ece": 0.13760784313725483, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.009097560975609653, "calib/mean_conf": 0.6663137254901962, "calib/mu_c": 0.6680975609756097, "calib/mu_w": 0.659, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.05485577889591566, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5880213903743315, "calib/step_q_c_n": 2805.0, "calib/step_q_gap": 0.00788602658414539, "calib/step_q_w": 0.5801353637901862, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 874.4453125, "completions/mean_terminated_length": 881.3306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.17813333333333334, "grad_norm": 0.12058939784765244, "kl": 0.046085357666015625, "learning_rate": 9.444444444444445e-07, "loss": -0.0172, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017945684492588043, "mask/share_reasoning": 0.7946851849555969, "mask/share_step_conf": 0.17955660820007324, "num_tokens": 51464047.0, "reward": 0.795635461807251, "reward_std": 0.16444918513298035, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8200753927230835, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4118204712867737, "step": 167 }, { "adv/mean_abs_final_conf": 0.7644122242927551, "adv/mean_abs_reasoning": 0.35399892926216125, "adv/mean_abs_step_conf": 0.7623947262763977, "adv/ratio_final_to_reasoning": 2.1593630971879403, "adv/ratio_step_to_reasoning": 2.1536639330109115, "adv/std_final_conf": 0.9302164316177368, "adv/std_reasoning": 0.6402161717414856, "adv/std_step_conf": 0.9356199502944946, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 15.15625, "calib/ece": 0.1268627450980392, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.01568627450980392, "calib/gap": -0.03547490347490334, "calib/mean_conf": 0.6865490196078431, "calib/mu_c": 0.6768108108108108, "calib/mu_w": 0.7122857142857142, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04396078431372551, "calib/std_conf": 0.07045407730460142, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5919820522824815, "calib/step_q_c_n": 2563.0, "calib/step_q_gap": -0.027364948476820028, "calib/step_q_w": 0.6193470007593015, "calib/step_q_w_n": 1317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 980.65234375, "completions/mean_terminated_length": 988.3740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.1792, "grad_norm": 0.1715572476387024, "kl": 0.0460968017578125, "learning_rate": 9.166666666666666e-07, "loss": -0.0237, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.016328081488609314, "mask/share_reasoning": 0.7952143549919128, "mask/share_step_conf": 0.18064509332180023, "num_tokens": 51819766.0, "reward": 0.7117259502410889, "reward_std": 0.21040663123130798, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7771878838539124, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3025140166282654, "step": 168 }, { "adv/mean_abs_final_conf": 0.7541103363037109, "adv/mean_abs_reasoning": 0.2814164161682129, "adv/mean_abs_step_conf": 0.7638496160507202, "adv/ratio_final_to_reasoning": 2.6796956146756967, "adv/ratio_step_to_reasoning": 2.7143036872239157, "adv/std_final_conf": 0.9286118745803833, "adv/std_reasoning": 0.5726398825645447, "adv/std_step_conf": 0.9354013800621033, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.765625, "calib/ece": 0.07549019607843138, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.00700406276267862, "calib/mean_conf": 0.6610588235294117, "calib/mu_c": 0.6587790697674418, "calib/mu_w": 0.6657831325301204, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.031019607843137287, "calib/std_conf": 0.049602898304278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5780000000000001, "calib/step_q_c_n": 2070.0, "calib/step_q_gap": -0.01578964941569272, "calib/step_q_w": 0.5937896494156928, "calib/step_q_w_n": 1198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 868.57421875, "completions/mean_terminated_length": 875.4133911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.18026666666666666, "grad_norm": 0.14597614109516144, "kl": 0.0505218505859375, "learning_rate": 8.88888888888889e-07, "loss": -0.0084, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018267178907990456, "mask/share_reasoning": 0.795763373374939, "mask/share_step_conf": 0.17815694212913513, "num_tokens": 52146305.0, "reward": 0.6932387351989746, "reward_std": 0.19367341697216034, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7717105150222778, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2811731696128845, "step": 169 }, { "adv/mean_abs_final_conf": 0.7423576712608337, "adv/mean_abs_reasoning": 0.2969627380371094, "adv/mean_abs_step_conf": 0.7681744694709778, "adv/ratio_final_to_reasoning": 2.4998344107672743, "adv/ratio_step_to_reasoning": 2.5867705643762764, "adv/std_final_conf": 0.9297893643379211, "adv/std_reasoning": 0.5959508419036865, "adv/std_step_conf": 0.9360413551330566, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.5078125, "calib/ece": 0.0816862745098039, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.002643724696356209, "calib/mean_conf": 0.6724313725490195, "calib/mu_c": 0.6731052631578947, "calib/mu_w": 0.6704615384615384, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004509803921568629, "calib/std_conf": 0.05080885166897625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5863885530028214, "calib/step_q_c_n": 2481.0, "calib/step_q_gap": -0.010909297560126374, "calib/step_q_w": 0.5972978505629478, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 900.76953125, "completions/mean_terminated_length": 904.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.18133333333333335, "grad_norm": 0.19842861592769623, "kl": 0.047882080078125, "learning_rate": 8.611111111111112e-07, "loss": -0.0151, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017363911494612694, "mask/share_reasoning": 0.7947203516960144, "mask/share_step_conf": 0.18400946259498596, "num_tokens": 52481054.0, "reward": 0.7063318490982056, "reward_std": 0.20461004972457886, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.8000777363777161, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.26492974162101746, "step": 170 }, { "adv/mean_abs_final_conf": 0.7441630363464355, "adv/mean_abs_reasoning": 0.26128995418548584, "adv/mean_abs_step_conf": 0.7661536335945129, "adv/ratio_final_to_reasoning": 2.8480353891377135, "adv/ratio_step_to_reasoning": 2.932197052821372, "adv/std_final_conf": 0.9314100742340088, "adv/std_reasoning": 0.5725415945053101, "adv/std_step_conf": 0.93570876121521, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.25, "calib/ece": 0.08803921568627451, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007858428700012854, "calib/mean_conf": 0.6673333333333334, "calib/mu_c": 0.6642207792207792, "calib/mu_w": 0.672079207920792, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07572549019607842, "calib/std_conf": 0.05861762656224533, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5893823116518485, "calib/step_q_c_n": 2137.0, "calib/step_q_gap": -0.0010412489040747541, "calib/step_q_w": 0.5904235605559233, "calib/step_q_w_n": 1511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 879.234375, "completions/mean_terminated_length": 882.6824340820312, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.1824, "grad_norm": 0.18084461987018585, "kl": 0.0527496337890625, "learning_rate": 8.333333333333333e-07, "loss": -0.0115, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01803310588002205, "mask/share_reasoning": 0.7983774542808533, "mask/share_step_conf": 0.17968320846557617, "num_tokens": 52813034.0, "reward": 0.6411213278770447, "reward_std": 0.1809045970439911, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7466551065444946, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.21605640649795532, "step": 171 }, { "adv/mean_abs_final_conf": 0.7528913021087646, "adv/mean_abs_reasoning": 0.309874027967453, "adv/mean_abs_step_conf": 0.7537387609481812, "adv/ratio_final_to_reasoning": 2.429668943367668, "adv/ratio_step_to_reasoning": 2.4324037928966042, "adv/std_final_conf": 0.9280959963798523, "adv/std_reasoning": 0.5960396528244019, "adv/std_step_conf": 0.9357143640518188, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.8828125, "calib/ece": 0.198740157480315, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02959276980329617, "calib/mean_conf": 0.6540944881889763, "calib/mu_c": 0.6488516746411483, "calib/mu_w": 0.6784444444444445, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.015000000000000001, "calib/std_conf": 0.055992889462925036, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5778473662380755, "calib/step_q_c_n": 2411.0, "calib/step_q_gap": -0.008571017280149529, "calib/step_q_w": 0.5864183835182251, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 820.38671875, "completions/mean_terminated_length": 826.846435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.18346666666666667, "grad_norm": 0.15892651677131653, "kl": 0.0509033203125, "learning_rate": 8.055555555555557e-07, "loss": -0.036, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019890522584319115, "mask/share_reasoning": 0.7924667596817017, "mask/share_step_conf": 0.17983026802539825, "num_tokens": 53126405.0, "reward": 0.8330913782119751, "reward_std": 0.19977548718452454, "rewards/accuracy_reward_step": 0.81640625, "rewards/final_brier_reward_step": 0.8076265454292297, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.4968373775482178, "step": 172 }, { "adv/mean_abs_final_conf": 0.761202871799469, "adv/mean_abs_reasoning": 0.23559096455574036, "adv/mean_abs_step_conf": 0.7611009478569031, "adv/ratio_final_to_reasoning": 3.2310359322773174, "adv/ratio_step_to_reasoning": 3.2306033013283413, "adv/std_final_conf": 0.9258071184158325, "adv/std_reasoning": 0.5226951241493225, "adv/std_step_conf": 0.9345263838768005, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.8671875, "calib/ece": 0.12457031249999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.003093838099073598, "calib/mean_conf": 0.6675390625, "calib/mu_c": 0.6683246073298429, "calib/mu_w": 0.6652307692307693, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023007812500000006, "calib/std_conf": 0.07051566784496262, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5879166666666666, "calib/step_q_c_n": 2472.0, "calib/step_q_gap": -0.005185523114355273, "calib/step_q_w": 0.5931021897810219, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 870.73046875, "completions/mean_terminated_length": 877.5866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.18453333333333333, "grad_norm": 0.16663791239261627, "kl": 0.0567626953125, "learning_rate": 7.777777777777779e-07, "loss": 0.0124, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0201319120824337, "mask/share_reasoning": 0.7883695960044861, "mask/share_step_conf": 0.1836860179901123, "num_tokens": 53452472.0, "reward": 0.7370842695236206, "reward_std": 0.1307639330625534, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.800590991973877, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.32435885071754456, "step": 173 }, { "adv/mean_abs_final_conf": 0.7598955035209656, "adv/mean_abs_reasoning": 0.5007636547088623, "adv/mean_abs_step_conf": 0.7425982356071472, "adv/ratio_final_to_reasoning": 1.5174733556946325, "adv/ratio_step_to_reasoning": 1.4829315758526536, "adv/std_final_conf": 0.9315431714057922, "adv/std_reasoning": 0.7575818300247192, "adv/std_step_conf": 0.9360539317131042, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.90234375, "calib/ece": 0.08381526104417669, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0014946985282483505, "calib/mean_conf": 0.6689558232931727, "calib/mu_c": 0.6693820224719101, "calib/mu_w": 0.6678873239436618, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.018955823293172715, "calib/std_conf": 0.058698347850569405, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5949143767423337, "calib/step_q_c_n": 2511.0, "calib/step_q_gap": -0.006431043104994538, "calib/step_q_w": 0.6013454198473283, "calib/step_q_w_n": 1048.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 896.1875, "completions/mean_terminated_length": 917.696044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.1856, "grad_norm": 0.47667044401168823, "kl": 0.049530029296875, "learning_rate": 7.5e-07, "loss": -0.0361, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017272762954235077, "mask/share_reasoning": 0.7841352224349976, "mask/share_step_conf": 0.17515450716018677, "num_tokens": 53786128.0, "reward": 0.7226041555404663, "reward_std": 0.24592873454093933, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7657800912857056, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.34661564230918884, "step": 174 }, { "adv/mean_abs_final_conf": 0.7577539682388306, "adv/mean_abs_reasoning": 0.44572803378105164, "adv/mean_abs_step_conf": 0.7865190505981445, "adv/ratio_final_to_reasoning": 1.7000365936396336, "adv/ratio_step_to_reasoning": 1.7645716468093964, "adv/std_final_conf": 0.9305837750434875, "adv/std_reasoning": 0.7013881802558899, "adv/std_step_conf": 0.9361911416053772, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.7109375, "calib/ece": 0.15802371541501978, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.04027268093781855, "calib/mean_conf": 0.6776284584980237, "calib/mu_c": 0.6602777777777779, "calib/mu_w": 0.7005504587155964, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1332411067193676, "calib/std_conf": 0.06925855421827964, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5890109890109889, "calib/step_q_c_n": 1911.0, "calib/step_q_gap": -0.02031515654157179, "calib/step_q_w": 0.6093261455525607, "calib/step_q_w_n": 1855.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 928.3125, "completions/mean_terminated_length": 943.0476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 289.0, "epoch": 0.18666666666666668, "grad_norm": 0.17033083736896515, "kl": 0.04911041259765625, "learning_rate": 7.222222222222222e-07, "loss": -0.0705, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018152743577957153, "mask/share_reasoning": 0.7822288274765015, "mask/share_step_conf": 0.18399344384670258, "num_tokens": 54129600.0, "reward": 0.6329188942909241, "reward_std": 0.24493342638015747, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7100539207458496, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.24562771618366241, "step": 175 }, { "adv/mean_abs_final_conf": 0.7080068588256836, "adv/mean_abs_reasoning": 0.3450409770011902, "adv/mean_abs_step_conf": 0.744574785232544, "adv/ratio_final_to_reasoning": 2.0519500755507116, "adv/ratio_step_to_reasoning": 2.157931477309652, "adv/std_final_conf": 0.9148129820823669, "adv/std_reasoning": 0.6401710510253906, "adv/std_step_conf": 0.9354899525642395, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.328125, "calib/ece": 0.06141176470588236, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.004428571428571448, "calib/mean_conf": 0.6659607843137255, "calib/mu_c": 0.6645714285714286, "calib/mu_w": 0.669, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.020549019607843125, "calib/std_conf": 0.06376856462771426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5853746654772525, "calib/step_q_c_n": 2242.0, "calib/step_q_gap": -0.013078325975738991, "calib/step_q_w": 0.5984529914529915, "calib/step_q_w_n": 1170.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 885.89453125, "completions/mean_terminated_length": 892.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.18773333333333334, "grad_norm": 0.19338367879390717, "kl": 0.04975128173828125, "learning_rate": 6.944444444444446e-07, "loss": 0.0051, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019442148506641388, "mask/share_reasoning": 0.7906622886657715, "mask/share_step_conf": 0.18208308517932892, "num_tokens": 54460453.0, "reward": 0.7801279425621033, "reward_std": 0.1804857850074768, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7752718925476074, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4490464925765991, "step": 176 }, { "adv/mean_abs_final_conf": 0.7342836260795593, "adv/mean_abs_reasoning": 0.28962162137031555, "adv/mean_abs_step_conf": 0.7492353916168213, "adv/ratio_final_to_reasoning": 2.5353204729859953, "adv/ratio_step_to_reasoning": 2.586945643325555, "adv/std_final_conf": 0.9285452961921692, "adv/std_reasoning": 0.595961332321167, "adv/std_step_conf": 0.9352489113807678, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.16015625, "calib/ece": 0.10365079365079358, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03676552106430164, "calib/mean_conf": 0.6577777777777778, "calib/mu_c": 0.6449390243902439, "calib/mu_w": 0.6817045454545455, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05531746031746029, "calib/std_conf": 0.05584685534749997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5758350730688936, "calib/step_q_c_n": 1916.0, "calib/step_q_gap": -0.025871740420438827, "calib/step_q_w": 0.6017068134893324, "calib/step_q_w_n": 1453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 829.31640625, "completions/mean_terminated_length": 839.1502075195312, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.1888, "grad_norm": 0.16369785368442535, "kl": 0.0509490966796875, "learning_rate": 6.666666666666667e-07, "loss": -0.0189, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019376084208488464, "mask/share_reasoning": 0.7894700765609741, "mask/share_step_conf": 0.17943505942821503, "num_tokens": 54776590.0, "reward": 0.7116050124168396, "reward_std": 0.17901837825775146, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7410968542098999, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.357113242149353, "step": 177 }, { "adv/mean_abs_final_conf": 0.7370127439498901, "adv/mean_abs_reasoning": 0.3701658248901367, "adv/mean_abs_step_conf": 0.7550868391990662, "adv/ratio_final_to_reasoning": 1.991034002581496, "adv/ratio_step_to_reasoning": 2.03986102559082, "adv/std_final_conf": 0.9300718307495117, "adv/std_reasoning": 0.6611856818199158, "adv/std_step_conf": 0.9358887076377869, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.171875, "calib/ece": 0.06717647058823525, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.012167721518987262, "calib/mean_conf": 0.6750196078431371, "calib/mu_c": 0.67125, "calib/mu_w": 0.6834177215189873, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.026000000000000013, "calib/std_conf": 0.06235178195987427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.589878508588186, "calib/step_q_c_n": 2387.0, "calib/step_q_gap": -0.011475238390057263, "calib/step_q_w": 0.6013537469782433, "calib/step_q_w_n": 1241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 865.33984375, "completions/mean_terminated_length": 872.153564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.18986666666666666, "grad_norm": 0.16527439653873444, "kl": 0.04903411865234375, "learning_rate": 6.388888888888889e-07, "loss": -0.0028, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01791071519255638, "mask/share_reasoning": 0.7861236333847046, "mask/share_step_conf": 0.18815310299396515, "num_tokens": 55104189.0, "reward": 0.7469719648361206, "reward_std": 0.22839292883872986, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7738183736801147, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.38340672850608826, "step": 178 }, { "adv/mean_abs_final_conf": 0.750512957572937, "adv/mean_abs_reasoning": 0.316341757774353, "adv/mean_abs_step_conf": 0.7563944458961487, "adv/ratio_final_to_reasoning": 2.3724751447713675, "adv/ratio_step_to_reasoning": 2.3910673419083857, "adv/std_final_conf": 0.9268893003463745, "adv/std_reasoning": 0.6185594797134399, "adv/std_step_conf": 0.9358287453651428, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.703125, "calib/ece": 0.15624000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.04347124642206612, "calib/mean_conf": 0.6664800000000001, "calib/mu_c": 0.6558730158730159, "calib/mu_w": 0.6993442622950821, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03336, "calib/std_conf": 0.05700885545246458, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5797626112759644, "calib/step_q_c_n": 2359.0, "calib/step_q_gap": -0.032500225973818075, "calib/step_q_w": 0.6122628372497825, "calib/step_q_w_n": 1149.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 876.16796875, "completions/mean_terminated_length": 893.6215209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.19093333333333334, "grad_norm": 0.2310127168893814, "kl": 0.04970550537109375, "learning_rate": 6.111111111111112e-07, "loss": -0.0686, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018145693466067314, "mask/share_reasoning": 0.7825173139572144, "mask/share_step_conf": 0.179805725812912, "num_tokens": 55434752.0, "reward": 0.7343745231628418, "reward_std": 0.21182644367218018, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7697601914405823, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3560202717781067, "step": 179 }, { "adv/mean_abs_final_conf": 0.7362815141677856, "adv/mean_abs_reasoning": 0.3285897374153137, "adv/mean_abs_step_conf": 0.7740625143051147, "adv/ratio_final_to_reasoning": 2.2407319229120626, "adv/ratio_step_to_reasoning": 2.3557111685650596, "adv/std_final_conf": 0.9302303194999695, "adv/std_reasoning": 0.6184530854225159, "adv/std_step_conf": 0.936010479927063, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 15.328125, "calib/ece": 0.08861111111111107, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": -0.010345357430306512, "calib/mean_conf": 0.6882142857142858, "calib/mu_c": 0.684971098265896, "calib/mu_w": 0.6953164556962025, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04515873015873011, "calib/std_conf": 0.06839707911043426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5966389219183512, "calib/step_q_c_n": 2523.0, "calib/step_q_gap": -0.020191913199421818, "calib/step_q_w": 0.616830835117773, "calib/step_q_w_n": 1401.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 1002.41796875, "completions/mean_terminated_length": 1022.386474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.192, "grad_norm": 0.13480441272258759, "kl": 0.041332244873046875, "learning_rate": 5.833333333333334e-07, "loss": 0.0373, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.015691669657826424, "mask/share_reasoning": 0.7890633344650269, "mask/share_step_conf": 0.17571374773979187, "num_tokens": 55795227.0, "reward": 0.7093750238418579, "reward_std": 0.20935778319835663, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7635316252708435, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3231871724128723, "step": 180 }, { "adv/mean_abs_final_conf": 0.7619613409042358, "adv/mean_abs_reasoning": 0.2994126081466675, "adv/mean_abs_step_conf": 0.7267587184906006, "adv/ratio_final_to_reasoning": 2.544853891159415, "adv/ratio_step_to_reasoning": 2.427281613119636, "adv/std_final_conf": 0.9292603135108948, "adv/std_reasoning": 0.5959325432777405, "adv/std_step_conf": 0.9358544945716858, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.10546875, "calib/ece": 0.03640625000000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.0017675657675658707, "calib/mean_conf": 0.665234375, "calib/mu_c": 0.6646060606060605, "calib/mu_w": 0.6663736263736264, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028554687500000016, "calib/std_conf": 0.05269257602698292, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5831547064305685, "calib/step_q_c_n": 2146.0, "calib/step_q_gap": -0.004893267101276022, "calib/step_q_w": 0.5880479735318446, "calib/step_q_w_n": 1209.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 842.7578125, "completions/mean_terminated_length": 849.3936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.19306666666666666, "grad_norm": 0.1750410497188568, "kl": 0.05321502685546875, "learning_rate": 5.555555555555555e-07, "loss": -0.027, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01841357722878456, "mask/share_reasoning": 0.7919554114341736, "mask/share_step_conf": 0.18181854486465454, "num_tokens": 56117237.0, "reward": 0.6624540686607361, "reward_std": 0.19779205322265625, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7668741941452026, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.2291276752948761, "step": 181 }, { "adv/mean_abs_final_conf": 0.745903730392456, "adv/mean_abs_reasoning": 0.32951363921165466, "adv/mean_abs_step_conf": 0.7798632383346558, "adv/ratio_final_to_reasoning": 2.2636505492670786, "adv/ratio_step_to_reasoning": 2.366710040289806, "adv/std_final_conf": 0.9296409487724304, "adv/std_reasoning": 0.6184464693069458, "adv/std_step_conf": 0.9356173276901245, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.8125, "calib/ece": 0.0877734375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014350877192982447, "calib/mean_conf": 0.6644140624999999, "calib/mu_c": 0.6596491228070175, "calib/mu_w": 0.6739999999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.042109375, "calib/std_conf": 0.05715631353792942, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5816244822825586, "calib/step_q_c_n": 2173.0, "calib/step_q_gap": -0.004220142830359319, "calib/step_q_w": 0.5858446251129179, "calib/step_q_w_n": 1107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 863.03125, "completions/mean_terminated_length": 869.8267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.19413333333333332, "grad_norm": 0.18245761096477509, "kl": 0.0487518310546875, "learning_rate": 5.277777777777779e-07, "loss": 0.0171, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018086858093738556, "mask/share_reasoning": 0.7957003116607666, "mask/share_step_conf": 0.17840032279491425, "num_tokens": 56444333.0, "reward": 0.6843788623809814, "reward_std": 0.2277456820011139, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7685683965682983, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.26659566164016724, "step": 182 }, { "adv/mean_abs_final_conf": 0.7563251256942749, "adv/mean_abs_reasoning": 0.45584434270858765, "adv/mean_abs_step_conf": 0.7536901831626892, "adv/ratio_final_to_reasoning": 1.6591740970179787, "adv/ratio_step_to_reasoning": 1.6533937411273492, "adv/std_final_conf": 0.9308493733406067, "adv/std_reasoning": 0.7205421924591064, "adv/std_step_conf": 0.9354673624038696, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 14.375, "calib/ece": 0.07956862745098046, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.016951846657316527, "calib/mean_conf": 0.674156862745098, "calib/mu_c": 0.6695698924731183, "calib/mu_w": 0.6865217391304348, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01215686274509806, "calib/std_conf": 0.05470755496775367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5892304702681695, "calib/step_q_c_n": 2573.0, "calib/step_q_gap": -0.015195907328939828, "calib/step_q_w": 0.6044263775971094, "calib/step_q_w_n": 1107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 921.55078125, "completions/mean_terminated_length": 928.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.1952, "grad_norm": 0.2044658213853836, "kl": 0.0491943359375, "learning_rate": 5.000000000000001e-07, "loss": 0.0288, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017305593937635422, "mask/share_reasoning": 0.7930552959442139, "mask/share_step_conf": 0.1818266063928604, "num_tokens": 56786930.0, "reward": 0.7817938923835754, "reward_std": 0.21965757012367249, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.786806583404541, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.43224990367889404, "step": 183 }, { "adv/mean_abs_final_conf": 0.748545229434967, "adv/mean_abs_reasoning": 0.3237451910972595, "adv/mean_abs_step_conf": 0.7547122240066528, "adv/ratio_final_to_reasoning": 2.3121431607924303, "adv/ratio_step_to_reasoning": 2.3311920756219733, "adv/std_final_conf": 0.9301021099090576, "adv/std_reasoning": 0.6184375882148743, "adv/std_step_conf": 0.9356794357299805, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.68359375, "calib/ece": 0.10901185770750993, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.013689839572192497, "calib/mean_conf": 0.6589723320158102, "calib/mu_c": 0.6554010695187166, "calib/mu_w": 0.6690909090909091, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014426877470355716, "calib/std_conf": 0.054813114264309196, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.579004424778761, "calib/step_q_c_n": 2260.0, "calib/step_q_gap": -0.018432252019617956, "calib/step_q_w": 0.597436676798379, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 865.20703125, "completions/mean_terminated_length": 878.9405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.19626666666666667, "grad_norm": 0.14740177989006042, "kl": 0.0485382080078125, "learning_rate": 4.7222222222222226e-07, "loss": -0.0276, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017934516072273254, "mask/share_reasoning": 0.7966513633728027, "mask/share_step_conf": 0.16978907585144043, "num_tokens": 57113703.0, "reward": 0.7120828032493591, "reward_std": 0.19992341101169586, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7831875085830688, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2972281277179718, "step": 184 }, { "adv/mean_abs_final_conf": 0.750738799571991, "adv/mean_abs_reasoning": 0.2984664738178253, "adv/mean_abs_step_conf": 0.7430471181869507, "adv/ratio_final_to_reasoning": 2.5153203640225894, "adv/ratio_step_to_reasoning": 2.489549692741985, "adv/std_final_conf": 0.9285163283348083, "adv/std_reasoning": 0.5960050821304321, "adv/std_step_conf": 0.9359565377235413, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 14.265625, "calib/ece": 0.17133858267716545, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.026795142969056163, "calib/mean_conf": 0.664251968503937, "calib/mu_c": 0.656972972972973, "calib/mu_w": 0.6837681159420291, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05362204724409457, "calib/std_conf": 0.059697659314424174, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5884972562262558, "calib/step_q_c_n": 2369.0, "calib/step_q_gap": -0.0397256588166125, "calib/step_q_w": 0.6282229150428683, "calib/step_q_w_n": 1283.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 883.109375, "completions/mean_terminated_length": 893.5810546875, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.19733333333333333, "grad_norm": 0.16363385319709778, "kl": 0.042522430419921875, "learning_rate": 4.444444444444445e-07, "loss": 0.0004, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018001873046159744, "mask/share_reasoning": 0.7912944555282593, "mask/share_step_conf": 0.17898491024971008, "num_tokens": 57446699.0, "reward": 0.7514989972114563, "reward_std": 0.2056909203529358, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7777429819107056, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.38228633999824524, "step": 185 }, { "adv/mean_abs_final_conf": 0.7524923086166382, "adv/mean_abs_reasoning": 0.2546382546424866, "adv/mean_abs_step_conf": 0.738852858543396, "adv/ratio_final_to_reasoning": 2.955142422230082, "adv/ratio_step_to_reasoning": 2.901578396304786, "adv/std_final_conf": 0.9246767163276672, "adv/std_reasoning": 0.5482885241508484, "adv/std_step_conf": 0.9352895617485046, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.984375, "calib/ece": 0.16707509881422924, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015810276679841896, "calib/gap": -0.044274615774362536, "calib/mean_conf": 0.6678656126482213, "calib/mu_c": 0.6570157068062827, "calib/mu_w": 0.7012903225806453, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.040000000000000036, "calib/std_conf": 0.06039551559243489, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5798088907353551, "calib/step_q_c_n": 2407.0, "calib/step_q_gap": -0.03750568726976011, "calib/step_q_w": 0.6173145780051152, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 889.66796875, "completions/mean_terminated_length": 903.7897338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.1984, "grad_norm": 0.15272776782512665, "kl": 0.0468902587890625, "learning_rate": 4.1666666666666667e-07, "loss": -0.0542, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01797938346862793, "mask/share_reasoning": 0.7882766723632812, "mask/share_step_conf": 0.17811891436576843, "num_tokens": 57779494.0, "reward": 0.741277813911438, "reward_std": 0.1670185625553131, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.7761093378067017, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.36035263538360596, "step": 186 }, { "adv/mean_abs_final_conf": 0.7613424062728882, "adv/mean_abs_reasoning": 0.48220592737197876, "adv/mean_abs_step_conf": 0.772615373134613, "adv/ratio_final_to_reasoning": 1.5788740101603533, "adv/ratio_step_to_reasoning": 1.6022519203473196, "adv/std_final_conf": 0.9324616193771362, "adv/std_reasoning": 0.7392339110374451, "adv/std_step_conf": 0.936041533946991, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 15.078125, "calib/ece": 0.1039759036144578, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.018737546699875596, "calib/mean_conf": 0.6740160642570281, "calib/mu_c": 0.6685227272727272, "calib/mu_w": 0.6872602739726028, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03558232931726907, "calib/std_conf": 0.06741896625591312, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5895748160261651, "calib/step_q_c_n": 2446.0, "calib/step_q_gap": -0.04200227025389147, "calib/step_q_w": 0.6315770862800566, "calib/step_q_w_n": 1414.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 897.34375, "completions/mean_terminated_length": 926.290283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.19946666666666665, "grad_norm": 0.2161259949207306, "kl": 0.0449066162109375, "learning_rate": 3.8888888888888895e-07, "loss": -0.0177, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017458781599998474, "mask/share_reasoning": 0.7766166925430298, "mask/share_step_conf": 0.17467457056045532, "num_tokens": 58110758.0, "reward": 0.7140494585037231, "reward_std": 0.25206685066223145, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7580785155296326, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.33798912167549133, "step": 187 }, { "adv/mean_abs_final_conf": 0.7479918003082275, "adv/mean_abs_reasoning": 0.267370343208313, "adv/mean_abs_step_conf": 0.769452691078186, "adv/ratio_final_to_reasoning": 2.7975870148225597, "adv/ratio_step_to_reasoning": 2.8778535489206885, "adv/std_final_conf": 0.9274735450744629, "adv/std_reasoning": 0.5482823848724365, "adv/std_step_conf": 0.9348469376564026, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 14.10546875, "calib/ece": 0.1229644268774703, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.026561677758860647, "calib/mean_conf": 0.668498023715415, "calib/mu_c": 0.661043956043956, "calib/mu_w": 0.6876056338028167, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0360474308300395, "calib/std_conf": 0.05998778693262936, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5861072164948453, "calib/step_q_c_n": 2425.0, "calib/step_q_gap": -0.02619463847985959, "calib/step_q_w": 0.6123018549747049, "calib/step_q_w_n": 1186.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 909.1875, "completions/mean_terminated_length": 919.9684448242188, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.20053333333333334, "grad_norm": 0.11366663873195648, "kl": 0.046966552734375, "learning_rate": 3.611111111111111e-07, "loss": -0.04, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01853599213063717, "mask/share_reasoning": 0.7887281775474548, "mask/share_step_conf": 0.18101707100868225, "num_tokens": 58447582.0, "reward": 0.7634593844413757, "reward_std": 0.1651395559310913, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7720566391944885, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.41501832008361816, "step": 188 }, { "adv/mean_abs_final_conf": 0.7274742126464844, "adv/mean_abs_reasoning": 0.3696415424346924, "adv/mean_abs_step_conf": 0.7473438382148743, "adv/ratio_final_to_reasoning": 1.9680531789118731, "adv/ratio_step_to_reasoning": 2.0218069465147135, "adv/std_final_conf": 0.9266048073768616, "adv/std_reasoning": 0.6611664295196533, "adv/std_step_conf": 0.936164140701294, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.984375, "calib/ece": 0.10409448818897638, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.021564311394820068, "calib/mean_conf": 0.6561417322834645, "calib/mu_c": 0.6496045197740112, "calib/mu_w": 0.6711688311688313, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.031692913385826756, "calib/std_conf": 0.05016582335833497, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5773882017126547, "calib/step_q_c_n": 2102.0, "calib/step_q_gap": -0.017619981593401057, "calib/step_q_w": 0.5950081833060558, "calib/step_q_w_n": 1222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 823.2421875, "completions/mean_terminated_length": 833.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.2016, "grad_norm": 0.21445226669311523, "kl": 0.04804229736328125, "learning_rate": 3.3333333333333335e-07, "loss": -0.0545, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01917072758078575, "mask/share_reasoning": 0.7900059223175049, "mask/share_step_conf": 0.17910461127758026, "num_tokens": 58766100.0, "reward": 0.7019118666648865, "reward_std": 0.24199704825878143, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7694070339202881, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.2976979613304138, "step": 189 }, { "adv/mean_abs_final_conf": 0.7516705393791199, "adv/mean_abs_reasoning": 0.3387170433998108, "adv/mean_abs_step_conf": 0.7623640298843384, "adv/ratio_final_to_reasoning": 2.2191695222489054, "adv/ratio_step_to_reasoning": 2.250740093360074, "adv/std_final_conf": 0.9286521673202515, "adv/std_reasoning": 0.6185595989227295, "adv/std_step_conf": 0.9362083673477173, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.07421875, "calib/ece": 0.08423529411764709, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.020256869772998787, "calib/mean_conf": 0.6749803921568628, "calib/mu_c": 0.6675925925925926, "calib/mu_w": 0.6878494623655914, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06196078431372551, "calib/std_conf": 0.05709211346109072, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5859443631039531, "calib/step_q_c_n": 2049.0, "calib/step_q_gap": -0.006744388822086944, "calib/step_q_w": 0.5926887519260401, "calib/step_q_w_n": 1298.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 935.9375, "completions/mean_terminated_length": 943.3070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.20266666666666666, "grad_norm": 0.1718294471502304, "kl": 0.0427703857421875, "learning_rate": 3.055555555555556e-07, "loss": -0.0173, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0172400064766407, "mask/share_reasoning": 0.7973253726959229, "mask/share_step_conf": 0.17762216925621033, "num_tokens": 59111308.0, "reward": 0.6847772598266602, "reward_std": 0.23699063062667847, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7511374950408936, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2926359176635742, "step": 190 }, { "adv/mean_abs_final_conf": 0.7656644582748413, "adv/mean_abs_reasoning": 0.2647988796234131, "adv/mean_abs_step_conf": 0.7759451270103455, "adv/ratio_final_to_reasoning": 2.8914943271804634, "adv/ratio_step_to_reasoning": 2.9303187691498738, "adv/std_final_conf": 0.9286467432975769, "adv/std_reasoning": 0.5482805371284485, "adv/std_step_conf": 0.9354783296585083, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.5234375, "calib/ece": 0.08047430830039534, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01485738255033553, "calib/mean_conf": 0.66, "calib/mu_c": 0.6538926174496644, "calib/mu_w": 0.66875, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07577075098814239, "calib/std_conf": 0.04730691983242642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804259850905218, "calib/step_q_c_n": 1878.0, "calib/step_q_gap": -0.016499519959983222, "calib/step_q_w": 0.596925505050505, "calib/step_q_w_n": 1584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 831.8828125, "completions/mean_terminated_length": 841.7470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.20373333333333332, "grad_norm": 0.1342456191778183, "kl": 0.05359649658203125, "learning_rate": 2.7777777777777776e-07, "loss": -0.0491, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019521009176969528, "mask/share_reasoning": 0.7809332609176636, "mask/share_step_conf": 0.1878269612789154, "num_tokens": 59428438.0, "reward": 0.6383215188980103, "reward_std": 0.16677504777908325, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7347148656845093, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.22786563634872437, "step": 191 }, { "adv/mean_abs_final_conf": 0.7587395906448364, "adv/mean_abs_reasoning": 0.2984263002872467, "adv/mean_abs_step_conf": 0.7594149708747864, "adv/ratio_final_to_reasoning": 2.5424689107981453, "adv/ratio_step_to_reasoning": 2.544732049902507, "adv/std_final_conf": 0.926181972026825, "adv/std_reasoning": 0.5960554480552673, "adv/std_step_conf": 0.9353460073471069, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.73046875, "calib/ece": 0.14039525691699598, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.038947330132401925, "calib/mean_conf": 0.6573913043478262, "calib/mu_c": 0.6452298850574714, "calib/mu_w": 0.6841772151898733, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.055019762845849786, "calib/std_conf": 0.05948654681899828, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5767428003972195, "calib/step_q_c_n": 2014.0, "calib/step_q_gap": -0.027241135345752454, "calib/step_q_w": 0.6039839357429719, "calib/step_q_w_n": 1245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 850.34375, "completions/mean_terminated_length": 860.4269409179688, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.2048, "grad_norm": 0.16092029213905334, "kl": 0.05052947998046875, "learning_rate": 2.5000000000000004e-07, "loss": -0.0241, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020145608112215996, "mask/share_reasoning": 0.7876855134963989, "mask/share_step_conf": 0.18045015633106232, "num_tokens": 59751102.0, "reward": 0.7235573530197144, "reward_std": 0.19452857971191406, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.755107045173645, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.35841381549835205, "step": 192 }, { "adv/mean_abs_final_conf": 0.7481487989425659, "adv/mean_abs_reasoning": 0.38850438594818115, "adv/mean_abs_step_conf": 0.7433756589889526, "adv/ratio_final_to_reasoning": 1.925715193965286, "adv/ratio_step_to_reasoning": 1.9134292581399694, "adv/std_final_conf": 0.9300759434700012, "adv/std_reasoning": 0.6815869808197021, "adv/std_step_conf": 0.9363304972648621, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.76171875, "calib/ece": 0.08035714285714282, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.03173860991020938, "calib/mean_conf": 0.6730555555555556, "calib/mu_c": 0.6608387096774194, "calib/mu_w": 0.6925773195876288, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06916666666666664, "calib/std_conf": 0.06814280121938554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5856571978815599, "calib/step_q_c_n": 2077.0, "calib/step_q_gap": -0.02949556357554939, "calib/step_q_w": 0.6151527614571093, "calib/step_q_w_n": 1702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 882.81640625, "completions/mean_terminated_length": 896.8294067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.20586666666666667, "grad_norm": 0.16130997240543365, "kl": 0.046909332275390625, "learning_rate": 2.2222222222222224e-07, "loss": -0.0396, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01840098574757576, "mask/share_reasoning": 0.7815666198730469, "mask/share_step_conf": 0.18440741300582886, "num_tokens": 60082815.0, "reward": 0.6326255798339844, "reward_std": 0.2460542917251587, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7286441326141357, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.21863830089569092, "step": 193 }, { "adv/mean_abs_final_conf": 0.7442784905433655, "adv/mean_abs_reasoning": 0.22733885049819946, "adv/mean_abs_step_conf": 0.769699215888977, "adv/ratio_final_to_reasoning": 3.2738728506470576, "adv/ratio_step_to_reasoning": 3.3856915094020548, "adv/std_final_conf": 0.9278947710990906, "adv/std_reasoning": 0.4959554374217987, "adv/std_step_conf": 0.9358815550804138, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 13.81640625, "calib/ece": 0.11011718749999992, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.022809941520468113, "calib/mean_conf": 0.6673828125000001, "calib/mu_c": 0.660611111111111, "calib/mu_w": 0.6834210526315792, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0371875, "calib/std_conf": 0.059900519234726536, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.582438704028021, "calib/step_q_c_n": 2284.0, "calib/step_q_gap": -0.016707345453224054, "calib/step_q_w": 0.5991460494812451, "calib/step_q_w_n": 1253.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 863.06640625, "completions/mean_terminated_length": 869.8621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.20693333333333333, "grad_norm": 0.15819790959358215, "kl": 0.053924560546875, "learning_rate": 1.9444444444444447e-07, "loss": 0.0045, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01844419352710247, "mask/share_reasoning": 0.7916795611381531, "mask/share_step_conf": 0.1820637583732605, "num_tokens": 60409704.0, "reward": 0.7222098112106323, "reward_std": 0.1835189312696457, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7768715023994446, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.326923131942749, "step": 194 }, { "adv/mean_abs_final_conf": 0.7512425184249878, "adv/mean_abs_reasoning": 0.3248511850833893, "adv/mean_abs_step_conf": 0.7509489059448242, "adv/ratio_final_to_reasoning": 2.31257435072045, "adv/ratio_step_to_reasoning": 2.3116705138448417, "adv/std_final_conf": 0.9297266602516174, "adv/std_reasoning": 0.6185224056243896, "adv/std_step_conf": 0.9359630346298218, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 14.57421875, "calib/ece": 0.07638888888888887, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.025153632280601745, "calib/mean_conf": 0.6701190476190476, "calib/mu_c": 0.6618343195266272, "calib/mu_w": 0.6869879518072289, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03793650793650793, "calib/std_conf": 0.06463255201556993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5865831435079727, "calib/step_q_c_n": 2195.0, "calib/step_q_gap": -0.0378113877420273, "calib/step_q_w": 0.62439453125, "calib/step_q_w_n": 1536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 881.12109375, "completions/mean_terminated_length": 895.107177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.208, "grad_norm": 0.18600162863731384, "kl": 0.053314208984375, "learning_rate": 1.6666666666666668e-07, "loss": -0.0785, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018634159117937088, "mask/share_reasoning": 0.7847827672958374, "mask/share_step_conf": 0.18095804750919342, "num_tokens": 60741255.0, "reward": 0.6997027397155762, "reward_std": 0.19091691076755524, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7518917918205261, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3186074495315552, "step": 195 }, { "adv/mean_abs_final_conf": 0.74583899974823, "adv/mean_abs_reasoning": 0.23860622942447662, "adv/mean_abs_step_conf": 0.760269820690155, "adv/ratio_final_to_reasoning": 3.1258152880048846, "adv/ratio_step_to_reasoning": 3.186294936741351, "adv/std_final_conf": 0.9270501136779785, "adv/std_reasoning": 0.5227808356285095, "adv/std_step_conf": 0.9355534315109253, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.80859375, "calib/ece": 0.094, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.006836344151867846, "calib/mean_conf": 0.6485882352941176, "calib/mu_c": 0.6466847826086957, "calib/mu_w": 0.6535211267605635, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.010509803921568622, "calib/std_conf": 0.03978825152769234, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5722817836812145, "calib/step_q_c_n": 2108.0, "calib/step_q_gap": -0.007597997739550388, "calib/step_q_w": 0.5798797814207649, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 740.484375, "completions/mean_terminated_length": 746.31494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 287.0, "epoch": 0.20906666666666668, "grad_norm": 0.15738339722156525, "kl": 0.0513763427734375, "learning_rate": 1.3888888888888888e-07, "loss": 0.0058, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020728375762701035, "mask/share_reasoning": 0.7827023267745972, "mask/share_step_conf": 0.1887567937374115, "num_tokens": 61033363.0, "reward": 0.7170915603637695, "reward_std": 0.20412716269493103, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7863527536392212, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.30486172437667847, "step": 196 }, { "adv/mean_abs_final_conf": 0.7726929187774658, "adv/mean_abs_reasoning": 0.4176172614097595, "adv/mean_abs_step_conf": 0.7945266962051392, "adv/ratio_final_to_reasoning": 1.8502418127284055, "adv/ratio_step_to_reasoning": 1.9025236014503768, "adv/std_final_conf": 0.9308398365974426, "adv/std_reasoning": 0.6816394925117493, "adv/std_step_conf": 0.9358987212181091, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.921875, "calib/ece": 0.09376470588235293, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.01872598162071848, "calib/mean_conf": 0.6648235294117646, "calib/mu_c": 0.6586549707602339, "calib/mu_w": 0.6773809523809524, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.044, "calib/std_conf": 0.060149065118271595, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5869473684210527, "calib/step_q_c_n": 2280.0, "calib/step_q_gap": -0.014867273323495689, "calib/step_q_w": 0.6018146417445484, "calib/step_q_w_n": 1284.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 862.703125, "completions/mean_terminated_length": 869.4960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.21013333333333334, "grad_norm": 0.20018146932125092, "kl": 0.047637939453125, "learning_rate": 1.1111111111111112e-07, "loss": -0.0118, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0184207484126091, "mask/share_reasoning": 0.7925665378570557, "mask/share_step_conf": 0.18120017647743225, "num_tokens": 61359271.0, "reward": 0.7343250513076782, "reward_std": 0.2549676299095154, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7641792893409729, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3716582655906677, "step": 197 }, { "adv/mean_abs_final_conf": 0.7752957344055176, "adv/mean_abs_reasoning": 0.3700793981552124, "adv/mean_abs_step_conf": 0.7856580018997192, "adv/ratio_final_to_reasoning": 2.0949443234890808, "adv/ratio_step_to_reasoning": 2.122944443317031, "adv/std_final_conf": 0.9272284507751465, "adv/std_reasoning": 0.6402872800827026, "adv/std_step_conf": 0.936040997505188, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.84765625, "calib/ece": 0.09941176470588234, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0008826358826359293, "calib/mean_conf": 0.6552549019607842, "calib/mu_c": 0.655026455026455, "calib/mu_w": 0.6559090909090909, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.006745098039215728, "calib/std_conf": 0.053466442888848206, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5803194619588062, "calib/step_q_c_n": 2379.0, "calib/step_q_gap": -0.00581240617306189, "calib/step_q_w": 0.5861318681318681, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 808.09375, "completions/mean_terminated_length": 814.4566650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.2112, "grad_norm": 0.1991894245147705, "kl": 0.04837799072265625, "learning_rate": 8.333333333333334e-08, "loss": 0.0257, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020755808800458908, "mask/share_reasoning": 0.7868603467941284, "mask/share_step_conf": 0.18457132577896118, "num_tokens": 61671527.0, "reward": 0.7754865884780884, "reward_std": 0.22402167320251465, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7944706678390503, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.40962737798690796, "step": 198 }, { "adv/mean_abs_final_conf": 0.7386778593063354, "adv/mean_abs_reasoning": 0.3966127038002014, "adv/mean_abs_step_conf": 0.7696336507797241, "adv/ratio_final_to_reasoning": 1.8624664621899092, "adv/ratio_step_to_reasoning": 1.9405168906728631, "adv/std_final_conf": 0.9307670593261719, "adv/std_reasoning": 0.6816592216491699, "adv/std_step_conf": 0.9359711408615112, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 13.48046875, "calib/ece": 0.19595238095238102, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.036653708133971286, "calib/mean_conf": 0.6711111111111111, "calib/mu_c": 0.6600568181818182, "calib/mu_w": 0.6967105263157894, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08432539682539683, "calib/std_conf": 0.06608943221988206, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5846520146520147, "calib/step_q_c_n": 2184.0, "calib/step_q_gap": -0.021898893003865383, "calib/step_q_w": 0.60655090765588, "calib/step_q_w_n": 1267.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 898.6640625, "completions/mean_terminated_length": 912.9286499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.21226666666666666, "grad_norm": 0.16708968579769135, "kl": 0.053680419921875, "learning_rate": 5.555555555555556e-08, "loss": -0.0758, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018384799361228943, "mask/share_reasoning": 0.7842874526977539, "mask/share_step_conf": 0.18170276284217834, "num_tokens": 62005785.0, "reward": 0.7263771891593933, "reward_std": 0.23604628443717957, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7568007707595825, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3615786135196686, "step": 199 }, { "adv/mean_abs_final_conf": 0.7753833532333374, "adv/mean_abs_reasoning": 0.21887263655662537, "adv/mean_abs_step_conf": 0.7630901336669922, "adv/ratio_final_to_reasoning": 3.5426235340877574, "adv/ratio_step_to_reasoning": 3.4864574469982696, "adv/std_final_conf": 0.9273792505264282, "adv/std_reasoning": 0.4960128366947174, "adv/std_step_conf": 0.935178816318512, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 14.609375, "calib/ece": 0.22654618473895588, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.06397883597883591, "calib/mean_conf": 0.669437751004016, "calib/mu_c": 0.6540211640211641, "calib/mu_w": 0.718, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06847389558232936, "calib/std_conf": 0.06724412412605409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5762603757099171, "calib/step_q_c_n": 2289.0, "calib/step_q_gap": -0.07003872835624414, "calib/step_q_w": 0.6462991040661612, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 882.26171875, "completions/mean_terminated_length": 907.064208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.21333333333333335, "grad_norm": 0.14146360754966736, "kl": 0.046146392822265625, "learning_rate": 2.777777777777778e-08, "loss": -0.0666, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01773788221180439, "mask/share_reasoning": 0.7792631387710571, "mask/share_step_conf": 0.17565517127513885, "num_tokens": 62339692.0, "reward": 0.7361253499984741, "reward_std": 0.14766019582748413, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7597870826721191, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3702760934829712, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.03823093850282021, "train_runtime": 13687.4621, "train_samples_per_second": 3.741, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 62339692, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }