Files
PureRL-7B-v7-s2-corr-maskon/trainer_state.json
ModelHub XC b9d90b748d 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-7B-v7-s2-corr-maskon
Source: Original Platform
2026-05-28 23:08:24 +08:00

12044 lines
494 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.7557821869850159,
"adv/mean_abs_reasoning": 0.28040462732315063,
"adv/mean_abs_step_conf": 0.7646071910858154,
"adv/ratio_final_to_reasoning": 2.69532708571895,
"adv/ratio_step_to_reasoning": 2.7267994768312023,
"adv/std_final_conf": 0.9257818460464478,
"adv/std_reasoning": 0.5727222561836243,
"adv/std_step_conf": 0.9352434873580933,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.59765625,
"calib/ece": 0.23243902439024394,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.008130081300813009,
"calib/gap": -0.04614489795918364,
"calib/mean_conf": 0.6646341463414636,
"calib/mu_c": 0.6552551020408164,
"calib/mu_w": 0.7014,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05016260162601624,
"calib/std_conf": 0.05917169015101882,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.583372,
"calib/step_q_c_n": 2500.0,
"calib/step_q_gap": -0.0778082748585287,
"calib/step_q_w": 0.6611802748585287,
"calib/step_q_w_n": 1237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1943.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 750.2265625,
"completions/mean_terminated_length": 780.7235717773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.3157978057861328,
"kl": 0.00047022104263305664,
"learning_rate": 0.0,
"loss": -0.094,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01929234340786934,
"mask/share_reasoning": 0.7498296499252319,
"mask/share_step_conf": 0.19181546568870544,
"num_tokens": 299642.0,
"reward": 0.7782012224197388,
"reward_std": 0.1634182333946228,
"rewards/accuracy_reward_step": 0.765625,
"rewards/final_brier_reward_step": 0.7708241939544678,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.4402656555175781,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7929245233535767,
"adv/mean_abs_reasoning": 0.4050842523574829,
"adv/mean_abs_step_conf": 0.768415093421936,
"adv/ratio_final_to_reasoning": 1.9574311238685933,
"adv/ratio_step_to_reasoning": 1.8969265997134275,
"adv/std_final_conf": 0.9301473498344421,
"adv/std_reasoning": 0.6612725853919983,
"adv/std_step_conf": 0.9350779056549072,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.078125,
"calib/ece": 0.04704724409448811,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.008169981916817282,
"calib/mean_conf": 0.6691732283464566,
"calib/mu_c": 0.6717142857142857,
"calib/mu_w": 0.6635443037974684,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.013622047244094477,
"calib/std_conf": 0.060200661111313364,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5911686697057605,
"calib/step_q_c_n": 2413.0,
"calib/step_q_gap": -0.011375410898773475,
"calib/step_q_w": 0.602544080604534,
"calib/step_q_w_n": 1191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2563.0,
"completions/max_terminated_length": 2563.0,
"completions/mean_length": 867.8828125,
"completions/mean_terminated_length": 871.2863159179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.5620657801628113,
"kl": 0.0006206929683685303,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.047,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01878987066447735,
"mask/share_reasoning": 0.7856365442276001,
"mask/share_step_conf": 0.1916673481464386,
"num_tokens": 625108.0,
"reward": 0.7644762992858887,
"reward_std": 0.18184059858322144,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7790628671646118,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.4147334694862366,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7465832233428955,
"adv/mean_abs_reasoning": 0.32673323154449463,
"adv/mean_abs_step_conf": 0.7574703693389893,
"adv/ratio_final_to_reasoning": 2.2849932338187204,
"adv/ratio_step_to_reasoning": 2.3183144419022366,
"adv/std_final_conf": 0.9279414415359497,
"adv/std_reasoning": 0.6184476613998413,
"adv/std_step_conf": 0.9352378845214844,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.43359375,
"calib/ece": 0.17269531249999992,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0078125,
"calib/gap": -0.02415453527435607,
"calib/mean_conf": 0.6649609375000001,
"calib/mu_c": 0.6605263157894737,
"calib/mu_w": 0.6846808510638298,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010625000000000002,
"calib/std_conf": 0.056692688894786895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5873742196107234,
"calib/step_q_c_n": 2723.0,
"calib/step_q_gap": -0.01493024966301959,
"calib/step_q_w": 0.602304469273743,
"calib/step_q_w_n": 716.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2305.0,
"completions/max_terminated_length": 2305.0,
"completions/mean_length": 795.03125,
"completions/mean_terminated_length": 801.2913208007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.0032,
"grad_norm": 0.33921700716018677,
"kl": 0.0004811286926269531,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0258,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019342608749866486,
"mask/share_reasoning": 0.77967369556427,
"mask/share_step_conf": 0.1931712031364441,
"num_tokens": 933892.0,
"reward": 0.8321274518966675,
"reward_std": 0.2148733288049698,
"rewards/accuracy_reward_step": 0.81640625,
"rewards/final_brier_reward_step": 0.8167222738265991,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.4842514991760254,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7097917199134827,
"adv/mean_abs_reasoning": 0.24592944979667664,
"adv/mean_abs_step_conf": 0.7488648891448975,
"adv/ratio_final_to_reasoning": 2.8861599149687294,
"adv/ratio_step_to_reasoning": 3.045039501222912,
"adv/std_final_conf": 0.9128405451774597,
"adv/std_reasoning": 0.5481777787208557,
"adv/std_step_conf": 0.9354363083839417,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.29296875,
"calib/ece": 0.11666666666666665,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0196078431372549,
"calib/gap": -0.02782347282347286,
"calib/mean_conf": 0.6761960784313725,
"calib/mu_c": 0.668994708994709,
"calib/mu_w": 0.6968181818181819,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.025843137254901963,
"calib/std_conf": 0.06795067980956544,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5909930178432894,
"calib/step_q_c_n": 2578.0,
"calib/step_q_gap": -0.021513920177061996,
"calib/step_q_w": 0.6125069380203514,
"calib/step_q_w_n": 1081.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2352.0,
"completions/max_terminated_length": 2352.0,
"completions/mean_length": 868.8359375,
"completions/mean_terminated_length": 875.6771850585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.2606043219566345,
"kl": 0.0006146430969238281,
"learning_rate": 7.5e-07,
"loss": 0.0218,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.018096409738063812,
"mask/share_reasoning": 0.7837621569633484,
"mask/share_step_conf": 0.190328910946846,
"num_tokens": 1262482.0,
"reward": 0.7609648704528809,
"reward_std": 0.15543901920318604,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7855706810951233,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.38948407769203186,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7735724449157715,
"adv/mean_abs_reasoning": 0.2758006453514099,
"adv/mean_abs_step_conf": 0.771937906742096,
"adv/ratio_final_to_reasoning": 2.804824636759382,
"adv/ratio_step_to_reasoning": 2.7988981162771953,
"adv/std_final_conf": 0.9277228713035583,
"adv/std_reasoning": 0.5482957363128662,
"adv/std_step_conf": 0.9349169731140137,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.0859375,
"calib/ece": 0.10850980392156867,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.023529411764705882,
"calib/gap": -0.046050398885652766,
"calib/mean_conf": 0.6789411764705883,
"calib/mu_c": 0.6597986577181209,
"calib/mu_w": 0.7058490566037736,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1015686274509804,
"calib/std_conf": 0.0810425721493136,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5856219445953286,
"calib/step_q_c_n": 1841.0,
"calib/step_q_gap": -0.03614009506472804,
"calib/step_q_w": 0.6217620396600566,
"calib/step_q_w_n": 1765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 872.9921875,
"completions/mean_terminated_length": 879.8661499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.005333333333333333,
"grad_norm": 1.078272819519043,
"kl": 0.001311957836151123,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0172,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019209835678339005,
"mask/share_reasoning": 0.7786589860916138,
"mask/share_step_conf": 0.19431866705417633,
"num_tokens": 1592656.0,
"reward": 0.5782305002212524,
"reward_std": 0.2045661062002182,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7164065837860107,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.12442938983440399,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7472907304763794,
"adv/mean_abs_reasoning": 0.18372581899166107,
"adv/mean_abs_step_conf": 0.7483274340629578,
"adv/ratio_final_to_reasoning": 4.067423591184522,
"adv/ratio_step_to_reasoning": 4.073066258025078,
"adv/std_final_conf": 0.9264699220657349,
"adv/std_reasoning": 0.4374311864376068,
"adv/std_step_conf": 0.9353809356689453,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.9296875,
"calib/ece": 0.0898617187500001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.00648107380744134,
"calib/mean_conf": 0.65548671875,
"calib/mu_c": 0.6532335329341318,
"calib/mu_w": 0.6597146067415731,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.046502343749999994,
"calib/std_conf": 0.06676177666605644,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.580767754318618,
"calib/step_q_c_n": 2084.0,
"calib/step_q_gap": -0.01605483948236075,
"calib/step_q_w": 0.5968225938009788,
"calib/step_q_w_n": 1226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1984.0,
"completions/max_terminated_length": 1984.0,
"completions/mean_length": 756.47265625,
"completions/mean_terminated_length": 762.4291381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.0064,
"grad_norm": 0.29219964146614075,
"kl": 0.0009875297546386719,
"learning_rate": 1.25e-06,
"loss": 0.0021,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020911913365125656,
"mask/share_reasoning": 0.7737997770309448,
"mask/share_step_conf": 0.19747580587863922,
"num_tokens": 1892265.0,
"reward": 0.7232071757316589,
"reward_std": 0.14570766687393188,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7624897956848145,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.35423704981803894,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7696272134780884,
"adv/mean_abs_reasoning": 0.3594735264778137,
"adv/mean_abs_step_conf": 0.7798027992248535,
"adv/ratio_final_to_reasoning": 2.140984402993551,
"adv/ratio_step_to_reasoning": 2.1692913157347125,
"adv/std_final_conf": 0.9286594390869141,
"adv/std_reasoning": 0.6403165459632874,
"adv/std_step_conf": 0.9355884790420532,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 14.8515625,
"calib/ece": 0.09394422310756981,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0199203187250996,
"calib/gap": -0.004911071587372251,
"calib/mean_conf": 0.6777689243027889,
"calib/mu_c": 0.676242774566474,
"calib/mu_w": 0.6811538461538462,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.041235059760956226,
"calib/std_conf": 0.0627507114167244,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5945607553366175,
"calib/step_q_c_n": 2436.0,
"calib/step_q_gap": -0.028206448177291787,
"calib/step_q_w": 0.6227672035139092,
"calib/step_q_w_n": 1366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 888.43359375,
"completions/mean_terminated_length": 906.1315307617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.30397337675094604,
"kl": 0.000798642635345459,
"learning_rate": 1.5e-06,
"loss": -0.1192,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01730389893054962,
"mask/share_reasoning": 0.776395857334137,
"mask/share_step_conf": 0.1867690086364746,
"num_tokens": 2227128.0,
"reward": 0.6936839818954468,
"reward_std": 0.22909244894981384,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.764412522315979,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.2917054295539856,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7443795204162598,
"adv/mean_abs_reasoning": 0.359142541885376,
"adv/mean_abs_step_conf": 0.740021824836731,
"adv/ratio_final_to_reasoning": 2.0726576041605123,
"adv/ratio_step_to_reasoning": 2.060523994043893,
"adv/std_final_conf": 0.9140991568565369,
"adv/std_reasoning": 0.6403468251228333,
"adv/std_step_conf": 0.9353559613227844,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.953125,
"calib/ece": 0.14135999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.008,
"calib/gap": -0.03475222816399304,
"calib/mean_conf": 0.6742400000000001,
"calib/mu_c": 0.6624242424242424,
"calib/mu_w": 0.6971764705882354,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07780000000000002,
"calib/std_conf": 0.06498324707184153,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5856710158434297,
"calib/step_q_c_n": 2146.0,
"calib/step_q_gap": -0.04366905550020872,
"calib/step_q_w": 0.6293400713436385,
"calib/step_q_w_n": 1682.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2260.0,
"completions/max_terminated_length": 2260.0,
"completions/mean_length": 857.8125,
"completions/mean_terminated_length": 878.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.4903322458267212,
"kl": 0.0005944967269897461,
"learning_rate": 1.75e-06,
"loss": -0.1395,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01849009469151497,
"mask/share_reasoning": 0.7680476903915405,
"mask/share_step_conf": 0.190024733543396,
"num_tokens": 2553240.0,
"reward": 0.6751306056976318,
"reward_std": 0.2096453160047531,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7378687262535095,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.28817370533943176,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7298851013183594,
"adv/mean_abs_reasoning": 0.2622373104095459,
"adv/mean_abs_step_conf": 0.7591660022735596,
"adv/ratio_final_to_reasoning": 2.7832999819074953,
"adv/ratio_step_to_reasoning": 2.894958009933603,
"adv/std_final_conf": 0.9278061985969543,
"adv/std_reasoning": 0.5727177262306213,
"adv/std_step_conf": 0.9351189732551575,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 15.0859375,
"calib/ece": 0.10218253968253967,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.027777777777777776,
"calib/gap": 0.011294309501330257,
"calib/mean_conf": 0.6726587301587301,
"calib/mu_c": 0.675392670157068,
"calib/mu_w": 0.6640983606557378,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.008452380952380949,
"calib/std_conf": 0.07093811999361678,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6009610575205431,
"calib/step_q_c_n": 2799.0,
"calib/step_q_gap": -0.02568080513232618,
"calib/step_q_w": 0.6266418626528693,
"calib/step_q_w_n": 1063.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2858.0,
"completions/max_terminated_length": 2858.0,
"completions/mean_length": 865.7734375,
"completions/mean_terminated_length": 876.03955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 373.0,
"epoch": 0.0096,
"grad_norm": 0.5019803047180176,
"kl": 0.0006263852119445801,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1137,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018472330644726753,
"mask/share_reasoning": 0.779818594455719,
"mask/share_step_conf": 0.1899903267621994,
"num_tokens": 2882414.0,
"reward": 0.7639858722686768,
"reward_std": 0.1735934615135193,
"rewards/accuracy_reward_step": 0.74609375,
"rewards/final_brier_reward_step": 0.7957402467727661,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3861377537250519,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7413267493247986,
"adv/mean_abs_reasoning": 0.3311035633087158,
"adv/mean_abs_step_conf": 0.7383732795715332,
"adv/ratio_final_to_reasoning": 2.238957327782054,
"adv/ratio_step_to_reasoning": 2.2300372493517546,
"adv/std_final_conf": 0.9291539192199707,
"adv/std_reasoning": 0.6403902173042297,
"adv/std_step_conf": 0.935117781162262,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 15.2578125,
"calib/ece": 0.18198380566801617,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.024613217213114602,
"calib/mean_conf": 0.6712955465587044,
"calib/mu_c": 0.6649180327868853,
"calib/mu_w": 0.6895312499999999,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.056194331983805676,
"calib/std_conf": 0.06283406026960563,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.589368,
"calib/step_q_c_n": 2500.0,
"calib/step_q_gap": -0.04469352204836419,
"calib/step_q_w": 0.6340615220483642,
"calib/step_q_w_n": 1406.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2515.0,
"completions/max_terminated_length": 2515.0,
"completions/mean_length": 857.03125,
"completions/mean_terminated_length": 884.6773681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.3038390278816223,
"kl": 0.0007747411727905273,
"learning_rate": 2.25e-06,
"loss": -0.1784,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01796487718820572,
"mask/share_reasoning": 0.7671234607696533,
"mask/share_step_conf": 0.18366166949272156,
"num_tokens": 3208614.0,
"reward": 0.7771371006965637,
"reward_std": 0.20995953679084778,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7620207071304321,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.4563159942626953,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7395666837692261,
"adv/mean_abs_reasoning": 0.2475532591342926,
"adv/mean_abs_step_conf": 0.7714335918426514,
"adv/ratio_final_to_reasoning": 2.987505340691258,
"adv/ratio_step_to_reasoning": 3.1162328241623527,
"adv/std_final_conf": 0.9268957376480103,
"adv/std_reasoning": 0.5482560396194458,
"adv/std_step_conf": 0.9356309771537781,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.7265625,
"calib/ece": 0.12937007874015752,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": -0.040216941411546414,
"calib/mean_conf": 0.6769291338582677,
"calib/mu_c": 0.6641040462427745,
"calib/mu_w": 0.7043209876543209,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06259842519685044,
"calib/std_conf": 0.06555651298927354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5852804141501294,
"calib/step_q_c_n": 2318.0,
"calib/step_q_gap": -0.036255398522046844,
"calib/step_q_w": 0.6215358126721763,
"calib/step_q_w_n": 1452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2209.0,
"completions/max_terminated_length": 2209.0,
"completions/mean_length": 871.37890625,
"completions/mean_terminated_length": 881.7114868164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.25886476039886475,
"kl": 0.0006725192070007324,
"learning_rate": 2.5e-06,
"loss": -0.0581,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01826527714729309,
"mask/share_reasoning": 0.7783842086791992,
"mask/share_step_conf": 0.1916317641735077,
"num_tokens": 3536167.0,
"reward": 0.6843187808990479,
"reward_std": 0.19306161999702454,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7550671696662903,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.27997660636901855,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7612889409065247,
"adv/mean_abs_reasoning": 0.41153576970100403,
"adv/mean_abs_step_conf": 0.7721890211105347,
"adv/ratio_final_to_reasoning": 1.8498730777633965,
"adv/ratio_step_to_reasoning": 1.8763594272049755,
"adv/std_final_conf": 0.9294129014015198,
"adv/std_reasoning": 0.6818270087242126,
"adv/std_step_conf": 0.9352567195892334,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 15.1484375,
"calib/ece": 0.11770833333333333,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.008333333333333333,
"calib/gap": -0.01616477272727279,
"calib/mean_conf": 0.6647083333333332,
"calib/mu_c": 0.6603977272727272,
"calib/mu_w": 0.6765625,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.024541666666666663,
"calib/std_conf": 0.06142677155678043,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5832122286220647,
"calib/step_q_c_n": 2257.0,
"calib/step_q_gap": -0.06902657458583183,
"calib/step_q_w": 0.6522388032078965,
"calib/step_q_w_n": 1621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 747.890625,
"completions/mean_terminated_length": 794.4398803710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.0128,
"grad_norm": 0.27160966396331787,
"kl": 0.0008394122123718262,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.1756,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.0197913758456707,
"mask/share_reasoning": 0.7317686080932617,
"mask/share_step_conf": 0.18984632194042206,
"num_tokens": 3831803.0,
"reward": 0.7235974669456482,
"reward_std": 0.21748779714107513,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7402870655059814,
"rewards/format_reward_step": 0.9375,
"rewards/step_correlation_reward": 0.38190776109695435,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7387239336967468,
"adv/mean_abs_reasoning": 0.27716708183288574,
"adv/mean_abs_step_conf": 0.7305924892425537,
"adv/ratio_final_to_reasoning": 2.665265762483839,
"adv/ratio_step_to_reasoning": 2.6359280633587465,
"adv/std_final_conf": 0.9279195070266724,
"adv/std_reasoning": 0.5726190209388733,
"adv/std_step_conf": 0.9355447292327881,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.41796875,
"calib/ece": 0.11050980392156862,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": 0.0006725146198829357,
"calib/mean_conf": 0.6659607843137255,
"calib/mu_c": 0.666111111111111,
"calib/mu_w": 0.665438596491228,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05874302988584404,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5896403978576894,
"calib/step_q_c_n": 2614.0,
"calib/step_q_gap": -0.009105034541823431,
"calib/step_q_w": 0.5987454323995128,
"calib/step_q_w_n": 821.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2503.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 833.36328125,
"completions/mean_terminated_length": 836.6314086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 237.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.334152489900589,
"kl": 0.0010883808135986328,
"learning_rate": 3e-06,
"loss": 0.0129,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01961558125913143,
"mask/share_reasoning": 0.7841991782188416,
"mask/share_step_conf": 0.19227902591228485,
"num_tokens": 4149736.0,
"reward": 0.827056884765625,
"reward_std": 0.16083455085754395,
"rewards/accuracy_reward_step": 0.7734375,
"rewards/final_brier_reward_step": 0.8078383207321167,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.49236929416656494,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.764909565448761,
"adv/mean_abs_reasoning": 0.4123122990131378,
"adv/mean_abs_step_conf": 0.7722538113594055,
"adv/ratio_final_to_reasoning": 1.8551703824493193,
"adv/ratio_step_to_reasoning": 1.8729827201560112,
"adv/std_final_conf": 0.9291587471961975,
"adv/std_reasoning": 0.6817663311958313,
"adv/std_step_conf": 0.9356257319450378,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 14.640625,
"calib/ece": 0.08677419354838706,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.012096774193548387,
"calib/gap": -0.009504283965728222,
"calib/mean_conf": 0.6731451612903225,
"calib/mu_c": 0.6702325581395349,
"calib/mu_w": 0.6797368421052631,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03318548387096776,
"calib/std_conf": 0.06165548055962667,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5928045685279189,
"calib/step_q_c_n": 2364.0,
"calib/step_q_gap": -0.03080814823508693,
"calib/step_q_w": 0.6236127167630058,
"calib/step_q_w_n": 1384.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 912.83984375,
"completions/mean_terminated_length": 934.748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.32500895857810974,
"kl": 0.001935720443725586,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01796538010239601,
"mask/share_reasoning": 0.7767523527145386,
"mask/share_step_conf": 0.18184472620487213,
"num_tokens": 4488823.0,
"reward": 0.7065720558166504,
"reward_std": 0.2284417450428009,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7548531293869019,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.3301659822463989,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7412395477294922,
"adv/mean_abs_reasoning": 0.4259348511695862,
"adv/mean_abs_step_conf": 0.7584189176559448,
"adv/ratio_final_to_reasoning": 1.7402650797278085,
"adv/ratio_step_to_reasoning": 1.7805984074169594,
"adv/std_final_conf": 0.9293286204338074,
"adv/std_reasoning": 0.7205248475074768,
"adv/std_step_conf": 0.9360866546630859,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.19921875,
"calib/ece": 0.09861660079051382,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.011857707509881422,
"calib/gap": -0.025413377648524915,
"calib/mean_conf": 0.6684980237154151,
"calib/mu_c": 0.6597590361445784,
"calib/mu_w": 0.6851724137931033,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05549407114624505,
"calib/std_conf": 0.06361411734000602,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5882619047619048,
"calib/step_q_c_n": 2100.0,
"calib/step_q_gap": -0.026754381883046352,
"calib/step_q_w": 0.6150162866449511,
"calib/step_q_w_n": 1535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2325.0,
"completions/max_terminated_length": 2325.0,
"completions/mean_length": 820.140625,
"completions/mean_terminated_length": 829.8656616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.016,
"grad_norm": 0.40916433930397034,
"kl": 0.004666328430175781,
"learning_rate": 3.5e-06,
"loss": -0.0619,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019354475662112236,
"mask/share_reasoning": 0.7726200819015503,
"mask/share_step_conf": 0.19630667567253113,
"num_tokens": 4806659.0,
"reward": 0.7369135618209839,
"reward_std": 0.24567002058029175,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7498167753219604,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.39666664600372314,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7580640912055969,
"adv/mean_abs_reasoning": 0.30199116468429565,
"adv/mean_abs_step_conf": 0.7372256517410278,
"adv/ratio_final_to_reasoning": 2.5102194363801473,
"adv/ratio_step_to_reasoning": 2.441215962432975,
"adv/std_final_conf": 0.9269432425498962,
"adv/std_reasoning": 0.5960419178009033,
"adv/std_step_conf": 0.9359452128410339,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 15.15625,
"calib/ece": 0.12853174603174605,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003968253968253968,
"calib/gap": -0.019044067796610387,
"calib/mean_conf": 0.6870238095238095,
"calib/mu_c": 0.6813559322033897,
"calib/mu_w": 0.7004000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05658730158730163,
"calib/std_conf": 0.059116103718198997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5930035615354174,
"calib/step_q_c_n": 2527.0,
"calib/step_q_gap": -0.024141006092076922,
"calib/step_q_w": 0.6171445676274944,
"calib/step_q_w_n": 1353.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2312.0,
"completions/max_terminated_length": 2312.0,
"completions/mean_length": 995.6875,
"completions/mean_terminated_length": 1015.52197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 319.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.2688722014427185,
"kl": 0.003340482711791992,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0806,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.015878742560744286,
"mask/share_reasoning": 0.7861171960830688,
"mask/share_step_conf": 0.17847280204296112,
"num_tokens": 5170403.0,
"reward": 0.7015179991722107,
"reward_std": 0.21285340189933777,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7670894861221313,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3007902503013611,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7609270811080933,
"adv/mean_abs_reasoning": 0.2718398869037628,
"adv/mean_abs_step_conf": 0.7644103169441223,
"adv/ratio_final_to_reasoning": 2.7991737701740504,
"adv/ratio_step_to_reasoning": 2.8119873269912743,
"adv/std_final_conf": 0.9288200736045837,
"adv/std_reasoning": 0.5483255982398987,
"adv/std_step_conf": 0.93514484167099,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.578125,
"calib/ece": 0.12751968503937006,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.007840983760442954,
"calib/mean_conf": 0.6712992125984253,
"calib/mu_c": 0.6729353233830845,
"calib/mu_w": 0.6650943396226415,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.003740157480314958,
"calib/std_conf": 0.062237520010181915,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.594040331292762,
"calib/step_q_c_n": 2777.0,
"calib/step_q_gap": -0.00634593480165857,
"calib/step_q_w": 0.6003862660944206,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1947.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 804.3828125,
"completions/mean_terminated_length": 813.9209594726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.2445593923330307,
"kl": 0.007335186004638672,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0116,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.020442228764295578,
"mask/share_reasoning": 0.7655640840530396,
"mask/share_step_conf": 0.20227494835853577,
"num_tokens": 5479853.0,
"reward": 0.8299559354782104,
"reward_std": 0.1681516319513321,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.8127847909927368,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.49165841937065125,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.73135906457901,
"adv/mean_abs_reasoning": 0.3147658407688141,
"adv/mean_abs_step_conf": 0.7463362812995911,
"adv/ratio_final_to_reasoning": 2.3235020127745405,
"adv/ratio_step_to_reasoning": 2.3710841032707624,
"adv/std_final_conf": 0.9284005165100098,
"adv/std_reasoning": 0.6185328960418701,
"adv/std_step_conf": 0.9355065822601318,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 15.140625,
"calib/ece": 0.12024390243902439,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.012195121951219513,
"calib/gap": -0.033293650793650875,
"calib/mean_conf": 0.6708130081300813,
"calib/mu_c": 0.6594444444444444,
"calib/mu_w": 0.6927380952380953,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.066260162601626,
"calib/std_conf": 0.07266241685120631,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5880245746691871,
"calib/step_q_c_n": 2116.0,
"calib/step_q_gap": -0.0349936071489948,
"calib/step_q_w": 0.6230181818181819,
"calib/step_q_w_n": 1760.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2498.0,
"completions/max_terminated_length": 2498.0,
"completions/mean_length": 849.63671875,
"completions/mean_terminated_length": 880.5951538085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.0192,
"grad_norm": 0.27101266384124756,
"kl": 0.00897979736328125,
"learning_rate": 4.25e-06,
"loss": -0.0554,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.018169566988945007,
"mask/share_reasoning": 0.7656527757644653,
"mask/share_step_conf": 0.18102139234542847,
"num_tokens": 5808080.0,
"reward": 0.6677666306495667,
"reward_std": 0.19289115071296692,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7252484560012817,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.29153478145599365,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7627384662628174,
"adv/mean_abs_reasoning": 0.2930169403553009,
"adv/mean_abs_step_conf": 0.7524563074111938,
"adv/ratio_final_to_reasoning": 2.6030524560728487,
"adv/ratio_step_to_reasoning": 2.5679617925803018,
"adv/std_final_conf": 0.9256052374839783,
"adv/std_reasoning": 0.5726289749145508,
"adv/std_step_conf": 0.9351460337638855,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.04296875,
"calib/ece": 0.08541176470588227,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007218903986859537,
"calib/mean_conf": 0.6444705882352941,
"calib/mu_c": 0.6423756906077349,
"calib/mu_w": 0.6495945945945945,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010039215686274503,
"calib/std_conf": 0.033984052165902216,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.572906704172527,
"calib/step_q_c_n": 2133.0,
"calib/step_q_gap": -0.0059038221432624605,
"calib/step_q_w": 0.5788105263157894,
"calib/step_q_w_n": 950.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 724.125,
"completions/mean_terminated_length": 726.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.5130786895751953,
"kl": 0.014491081237792969,
"learning_rate": 4.5e-06,
"loss": 0.0017,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02103264629840851,
"mask/share_reasoning": 0.7846888899803162,
"mask/share_step_conf": 0.19037219882011414,
"num_tokens": 6098216.0,
"reward": 0.7534011006355286,
"reward_std": 0.15472280979156494,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7825515270233154,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.38362565636634827,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7196226119995117,
"adv/mean_abs_reasoning": 0.1906130015850067,
"adv/mean_abs_step_conf": 0.760223388671875,
"adv/ratio_final_to_reasoning": 3.7753070672809548,
"adv/ratio_step_to_reasoning": 3.98830815500716,
"adv/std_final_conf": 0.9120534658432007,
"adv/std_reasoning": 0.4959540367126465,
"adv/std_step_conf": 0.9357298612594604,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.82421875,
"calib/ece": 0.19436507936507932,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.011904761904761904,
"calib/gap": -0.05569444444444449,
"calib/mean_conf": 0.653968253968254,
"calib/mu_c": 0.6380555555555556,
"calib/mu_w": 0.6937500000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06702380952380953,
"calib/std_conf": 0.07203481703457948,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5779971181556196,
"calib/step_q_c_n": 2082.0,
"calib/step_q_gap": -0.05537131012166252,
"calib/step_q_w": 0.6333684282772821,
"calib/step_q_w_n": 1457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2285.0,
"completions/max_terminated_length": 2285.0,
"completions/mean_length": 757.83984375,
"completions/mean_terminated_length": 769.8690795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.021333333333333333,
"grad_norm": 2.31535005569458,
"kl": 0.017386436462402344,
"learning_rate": 4.75e-06,
"loss": -0.0447,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022121768444776535,
"mask/share_reasoning": 0.7714666724205017,
"mask/share_step_conf": 0.19078657031059265,
"num_tokens": 6397095.0,
"reward": 0.7693119049072266,
"reward_std": 0.15373152494430542,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7524155974388123,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.4487082362174988,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7619816660881042,
"adv/mean_abs_reasoning": 0.36749473214149475,
"adv/mean_abs_step_conf": 0.7683507204055786,
"adv/ratio_final_to_reasoning": 2.0734492210210025,
"adv/ratio_step_to_reasoning": 2.09078022949658,
"adv/std_final_conf": 0.9265251755714417,
"adv/std_reasoning": 0.6403968930244446,
"adv/std_step_conf": 0.9350817799568176,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.81640625,
"calib/ece": 0.1926907630522088,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.01606425702811245,
"calib/gap": -0.047142412935323175,
"calib/mean_conf": 0.6492369477911647,
"calib/mu_c": 0.6401492537313433,
"calib/mu_w": 0.6872916666666665,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.017349397590361443,
"calib/std_conf": 0.06157096731729032,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5765693130399013,
"calib/step_q_c_n": 2431.0,
"calib/step_q_gap": -0.06329271227655442,
"calib/step_q_w": 0.6398620253164558,
"calib/step_q_w_n": 1106.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2517.0,
"completions/max_terminated_length": 2517.0,
"completions/mean_length": 707.16796875,
"completions/mean_terminated_length": 724.1400146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.0224,
"grad_norm": 0.5563187003135681,
"kl": 0.024248123168945312,
"learning_rate": 5e-06,
"loss": -0.0013,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02241862565279007,
"mask/share_reasoning": 0.7598947882652283,
"mask/share_step_conf": 0.19424910843372345,
"num_tokens": 6681090.0,
"reward": 0.7932220697402954,
"reward_std": 0.20893710851669312,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.7790640592575073,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.45581766963005066,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7331605553627014,
"adv/mean_abs_reasoning": 0.1721845418214798,
"adv/mean_abs_step_conf": 0.7609281539916992,
"adv/ratio_final_to_reasoning": 4.257992893013817,
"adv/ratio_step_to_reasoning": 4.4192593942644764,
"adv/std_final_conf": 0.9263607263565063,
"adv/std_reasoning": 0.46741846203804016,
"adv/std_step_conf": 0.9356198310852051,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.46484375,
"calib/ece": 0.08101562500000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0006332809342016743,
"calib/mean_conf": 0.641328125,
"calib/mu_c": 0.6411475409836065,
"calib/mu_w": 0.6417808219178082,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0037500000000000025,
"calib/std_conf": 0.044841231963276565,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5746539027982328,
"calib/step_q_c_n": 2037.0,
"calib/step_q_gap": -0.0007247163554421343,
"calib/step_q_w": 0.5753786191536749,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1462.0,
"completions/max_terminated_length": 1462.0,
"completions/mean_length": 684.60546875,
"completions/mean_terminated_length": 689.9960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.28116798400878906,
"kl": 0.0274505615234375,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02279532700777054,
"mask/share_reasoning": 0.7776881456375122,
"mask/share_step_conf": 0.19170401990413666,
"num_tokens": 6958165.0,
"reward": 0.7616775631904602,
"reward_std": 0.14698928594589233,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7884843945503235,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.3919020891189575,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7574082612991333,
"adv/mean_abs_reasoning": 0.27920395135879517,
"adv/mean_abs_step_conf": 0.7400485277175903,
"adv/ratio_final_to_reasoning": 2.712741913619319,
"adv/ratio_step_to_reasoning": 2.6505660973493175,
"adv/std_final_conf": 0.9250745177268982,
"adv/std_reasoning": 0.5725696086883545,
"adv/std_step_conf": 0.9337242245674133,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.47265625,
"calib/ece": 0.14820312500000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.002167384026781183,
"calib/mean_conf": 0.6525781249999999,
"calib/mu_c": 0.6521463414634147,
"calib/mu_w": 0.6543137254901958,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.06460512380209774,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5821149241819633,
"calib/step_q_c_n": 2506.0,
"calib/step_q_gap": -0.006706036516726632,
"calib/step_q_w": 0.5888209606986899,
"calib/step_q_w_n": 687.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1832.0,
"completions/max_terminated_length": 1832.0,
"completions/mean_length": 760.65625,
"completions/mean_terminated_length": 766.6456909179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.27971068024635315,
"kl": 0.02925872802734375,
"learning_rate": 4.944444444444445e-06,
"loss": -0.006,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022922851145267487,
"mask/share_reasoning": 0.779192328453064,
"mask/share_step_conf": 0.19007235765457153,
"num_tokens": 7256829.0,
"reward": 0.8002302646636963,
"reward_std": 0.1789833903312683,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8136398792266846,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.42666441202163696,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7580659985542297,
"adv/mean_abs_reasoning": 0.5364935398101807,
"adv/mean_abs_step_conf": 0.7590062618255615,
"adv/ratio_final_to_reasoning": 1.4130011683317654,
"adv/ratio_step_to_reasoning": 1.4147537770801661,
"adv/std_final_conf": 0.9311103820800781,
"adv/std_reasoning": 0.7753890752792358,
"adv/std_step_conf": 0.936369776725769,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.07421875,
"calib/ece": 0.08474308300395252,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.007905138339920948,
"calib/gap": -0.013661558960692988,
"calib/mean_conf": 0.6558893280632411,
"calib/mu_c": 0.650759493670886,
"calib/mu_w": 0.664421052631579,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05806324110671935,
"calib/std_conf": 0.07233335491967625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5837024972855592,
"calib/step_q_c_n": 1842.0,
"calib/step_q_gap": -0.030762618993510515,
"calib/step_q_w": 0.6144651162790697,
"calib/step_q_w_n": 1505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 770.25390625,
"completions/mean_terminated_length": 779.3873901367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.0256,
"grad_norm": 0.3319094479084015,
"kl": 0.032161712646484375,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0027,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02267495170235634,
"mask/share_reasoning": 0.7736426591873169,
"mask/share_step_conf": 0.19196362793445587,
"num_tokens": 7558526.0,
"reward": 0.6610881090164185,
"reward_std": 0.2659045457839966,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7440546751022339,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.257027804851532,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7323684692382812,
"adv/mean_abs_reasoning": 0.13908778131008148,
"adv/mean_abs_step_conf": 0.7595969438552856,
"adv/ratio_final_to_reasoning": 5.265512630513124,
"adv/ratio_step_to_reasoning": 5.461277307758937,
"adv/std_final_conf": 0.9252333641052246,
"adv/std_reasoning": 0.40499961376190186,
"adv/std_step_conf": 0.9354682564735413,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.078125,
"calib/ece": 0.13213438735177863,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.027592477944590632,
"calib/mean_conf": 0.6494466403162056,
"calib/mu_c": 0.6417032967032966,
"calib/mu_w": 0.6692957746478873,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.031106719367588943,
"calib/std_conf": 0.05357284316037042,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5738152610441767,
"calib/step_q_c_n": 1992.0,
"calib/step_q_gap": -0.033366557137641584,
"calib/step_q_w": 0.6071818181818183,
"calib/step_q_w_n": 1100.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 727.58984375,
"completions/mean_terminated_length": 733.3189086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.22611305117607117,
"kl": 0.039325714111328125,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0109,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02241634577512741,
"mask/share_reasoning": 0.7723724246025085,
"mask/share_step_conf": 0.19739872217178345,
"num_tokens": 7848013.0,
"reward": 0.7145950794219971,
"reward_std": 0.156108558177948,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7700910568237305,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.31925538182258606,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7534859776496887,
"adv/mean_abs_reasoning": 0.2689468264579773,
"adv/mean_abs_step_conf": 0.7727804183959961,
"adv/ratio_final_to_reasoning": 2.8016169128042128,
"adv/ratio_step_to_reasoning": 2.87335763940216,
"adv/std_final_conf": 0.9244814515113831,
"adv/std_reasoning": 0.5483368635177612,
"adv/std_step_conf": 0.9353242516517639,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 11.46484375,
"calib/ece": 0.1467588932806324,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.019890109890109864,
"calib/mean_conf": 0.635691699604743,
"calib/mu_c": 0.63010989010989,
"calib/mu_w": 0.6499999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03154150197628457,
"calib/std_conf": 0.038673864517475974,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5706611140031235,
"calib/step_q_c_n": 1921.0,
"calib/step_q_gap": -0.02551245601660035,
"calib/step_q_w": 0.5961735700197238,
"calib/step_q_w_n": 1014.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2875.0,
"completions/max_terminated_length": 2875.0,
"completions/mean_length": 676.83984375,
"completions/mean_terminated_length": 684.8656616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.3391321897506714,
"kl": 0.0453948974609375,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0522,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022081192582845688,
"mask/share_reasoning": 0.779906690120697,
"mask/share_step_conf": 0.186293363571167,
"num_tokens": 8126524.0,
"reward": 0.740322470664978,
"reward_std": 0.18615061044692993,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7724347710609436,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3683663606643677,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7368289232254028,
"adv/mean_abs_reasoning": 0.2924867272377014,
"adv/mean_abs_step_conf": 0.7530105113983154,
"adv/ratio_final_to_reasoning": 2.519187554882066,
"adv/ratio_step_to_reasoning": 2.5745117342926482,
"adv/std_final_conf": 0.9256953001022339,
"adv/std_reasoning": 0.5960046648979187,
"adv/std_step_conf": 0.9359559416770935,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.2421875,
"calib/ece": 0.08752362204724409,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0006899336791647004,
"calib/mean_conf": 0.6411377952755906,
"calib/mu_c": 0.6414012738853503,
"calib/mu_w": 0.6407113402061856,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05527559055118106,
"calib/std_conf": 0.052515920989200174,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5727674129353234,
"calib/step_q_c_n": 1608.0,
"calib/step_q_gap": -0.012955421710345916,
"calib/step_q_w": 0.5857228346456693,
"calib/step_q_w_n": 1270.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1987.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 720.60546875,
"completions/mean_terminated_length": 726.279541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 309.0,
"epoch": 0.0288,
"grad_norm": 0.2796003818511963,
"kl": 0.049957275390625,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0482,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022812340408563614,
"mask/share_reasoning": 0.7833442687988281,
"mask/share_step_conf": 0.18603089451789856,
"num_tokens": 8416215.0,
"reward": 0.7067053318023682,
"reward_std": 0.1922786831855774,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.755042314529419,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.3372744917869568,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7352758049964905,
"adv/mean_abs_reasoning": 0.33253324031829834,
"adv/mean_abs_step_conf": 0.7541956901550293,
"adv/ratio_final_to_reasoning": 2.2111347554087826,
"adv/ratio_step_to_reasoning": 2.268030977694497,
"adv/std_final_conf": 0.9242193102836609,
"adv/std_reasoning": 0.6185281872749329,
"adv/std_step_conf": 0.9362373352050781,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.30078125,
"calib/ece": 0.1744313725490196,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": -0.032871794871795035,
"calib/mean_conf": 0.6395294117647058,
"calib/mu_c": 0.6317948717948717,
"calib/mu_w": 0.6646666666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.024627450980392148,
"calib/std_conf": 0.06496284094446263,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5762992125984252,
"calib/step_q_c_n": 2032.0,
"calib/step_q_gap": -0.0264766294457095,
"calib/step_q_w": 0.6027758420441347,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2309.0,
"completions/max_terminated_length": 2309.0,
"completions/mean_length": 732.38671875,
"completions/mean_terminated_length": 738.153564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.33571648597717285,
"kl": 0.046184539794921875,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0196,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.023265177384018898,
"mask/share_reasoning": 0.7898497581481934,
"mask/share_step_conf": 0.1790725439786911,
"num_tokens": 8710650.0,
"reward": 0.7391840219497681,
"reward_std": 0.20819517970085144,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7852710485458374,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.34153443574905396,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7568809390068054,
"adv/mean_abs_reasoning": 0.33271366357803345,
"adv/mean_abs_step_conf": 0.7712277770042419,
"adv/ratio_final_to_reasoning": 2.2748718248214934,
"adv/ratio_step_to_reasoning": 2.317992500549533,
"adv/std_final_conf": 0.9278278350830078,
"adv/std_reasoning": 0.6184771060943604,
"adv/std_step_conf": 0.9358929395675659,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.52734375,
"calib/ece": 0.0757905138339921,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.013584249084249045,
"calib/mean_conf": 0.6536166007905139,
"calib/mu_c": 0.6494285714285715,
"calib/mu_w": 0.6630128205128205,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.018853754940711443,
"calib/std_conf": 0.059171346629624394,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5832805219012116,
"calib/step_q_c_n": 2146.0,
"calib/step_q_gap": -0.005145491293887305,
"calib/step_q_w": 0.5884260131950989,
"calib/step_q_w_n": 1061.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2006.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 795.890625,
"completions/mean_terminated_length": 805.328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.282016396522522,
"kl": 0.0450897216796875,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0624,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.020920034497976303,
"mask/share_reasoning": 0.7820408940315247,
"mask/share_step_conf": 0.18532030284404755,
"num_tokens": 9021526.0,
"reward": 0.7136837244033813,
"reward_std": 0.19021692872047424,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.766909658908844,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3260827660560608,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7519515752792358,
"adv/mean_abs_reasoning": 0.3583611845970154,
"adv/mean_abs_step_conf": 0.7747514843940735,
"adv/ratio_final_to_reasoning": 2.098306422680294,
"adv/ratio_step_to_reasoning": 2.1619291309835846,
"adv/std_final_conf": 0.9269018173217773,
"adv/std_reasoning": 0.6402735710144043,
"adv/std_step_conf": 0.9351660013198853,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.05859375,
"calib/ece": 0.1345634920634921,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003968253968253968,
"calib/gap": -0.014091233071988674,
"calib/mean_conf": 0.6510714285714285,
"calib/mu_c": 0.6472131147540984,
"calib/mu_w": 0.661304347826087,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02972222222222222,
"calib/std_conf": 0.060310397978361824,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.585917667238422,
"calib/step_q_c_n": 2332.0,
"calib/step_q_gap": -0.018859780832794715,
"calib/step_q_w": 0.6047774480712167,
"calib/step_q_w_n": 1011.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2577.0,
"completions/max_terminated_length": 2577.0,
"completions/mean_length": 799.64453125,
"completions/mean_terminated_length": 812.3373413085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.032,
"grad_norm": 0.4014255404472351,
"kl": 0.047332763671875,
"learning_rate": 4.75e-06,
"loss": -0.0524,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020229246467351913,
"mask/share_reasoning": 0.7831885814666748,
"mask/share_step_conf": 0.18095718324184418,
"num_tokens": 9333219.0,
"reward": 0.720932126045227,
"reward_std": 0.19922210276126862,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7739925384521484,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3280279040336609,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7201747894287109,
"adv/mean_abs_reasoning": 0.2836382985115051,
"adv/mean_abs_step_conf": 0.742740273475647,
"adv/ratio_final_to_reasoning": 2.5390604625965163,
"adv/ratio_step_to_reasoning": 2.6186177162021,
"adv/std_final_conf": 0.9259783029556274,
"adv/std_reasoning": 0.5726578831672668,
"adv/std_step_conf": 0.9356769919395447,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.1796875,
"calib/ece": 0.06511811023622044,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.001478797638217788,
"calib/mean_conf": 0.6463779527559056,
"calib/mu_c": 0.6469135802469136,
"calib/mu_w": 0.6454347826086958,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03685039370078735,
"calib/std_conf": 0.06318538268290515,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5829363024339721,
"calib/step_q_c_n": 1931.0,
"calib/step_q_gap": -0.00392132182887539,
"calib/step_q_w": 0.5868576242628475,
"calib/step_q_w_n": 1187.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2250.0,
"completions/max_terminated_length": 2250.0,
"completions/mean_length": 754.67578125,
"completions/mean_terminated_length": 763.62451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 258.0,
"epoch": 0.03306666666666667,
"grad_norm": 2.021209716796875,
"kl": 0.055149078369140625,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0723,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021880831569433212,
"mask/share_reasoning": 0.7856607437133789,
"mask/share_step_conf": 0.18073971569538116,
"num_tokens": 9632328.0,
"reward": 0.6749727725982666,
"reward_std": 0.17479467391967773,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7596234083175659,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.26532211899757385,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7195700407028198,
"adv/mean_abs_reasoning": 0.30481335520744324,
"adv/mean_abs_step_conf": 0.7410525679588318,
"adv/ratio_final_to_reasoning": 2.3606906600700306,
"adv/ratio_step_to_reasoning": 2.4311683044678354,
"adv/std_final_conf": 0.9117735624313354,
"adv/std_reasoning": 0.5960460901260376,
"adv/std_step_conf": 0.9353088140487671,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.48828125,
"calib/ece": 0.04129921259842524,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.017254580077483705,
"calib/mean_conf": 0.6309842519685039,
"calib/mu_c": 0.6243949044585987,
"calib/mu_w": 0.6416494845360824,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.027086614173228385,
"calib/std_conf": 0.038354641077149774,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5667146282973621,
"calib/step_q_c_n": 1668.0,
"calib/step_q_gap": -0.020873745622512208,
"calib/step_q_w": 0.5875883739198743,
"calib/step_q_w_n": 1273.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1552.0,
"completions/max_terminated_length": 1552.0,
"completions/mean_length": 668.54296875,
"completions/mean_terminated_length": 676.4703979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.29071611166000366,
"kl": 0.06232452392578125,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0108,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02311065047979355,
"mask/share_reasoning": 0.7838659882545471,
"mask/share_step_conf": 0.18130461871623993,
"num_tokens": 9910179.0,
"reward": 0.6577374935150146,
"reward_std": 0.17762039601802826,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7482753992080688,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.2461058646440506,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7352038621902466,
"adv/mean_abs_reasoning": 0.2771219313144684,
"adv/mean_abs_step_conf": 0.7629679441452026,
"adv/ratio_final_to_reasoning": 2.6529977569908123,
"adv/ratio_step_to_reasoning": 2.7531849988423076,
"adv/std_final_conf": 0.9094998240470886,
"adv/std_reasoning": 0.5483477115631104,
"adv/std_step_conf": 0.9357249736785889,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.3984375,
"calib/ece": 0.042862745098039265,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0034060534591194846,
"calib/mean_conf": 0.6263137254901961,
"calib/mu_c": 0.6250314465408805,
"calib/mu_w": 0.6284375,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02282352941176473,
"calib/std_conf": 0.04399353348029126,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5658475110270952,
"calib/step_q_c_n": 1587.0,
"calib/step_q_gap": -0.011175744786858233,
"calib/step_q_w": 0.5770232558139534,
"calib/step_q_w_n": 1075.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 651.59765625,
"completions/mean_terminated_length": 656.7283325195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.0352,
"grad_norm": 0.2806883156299591,
"kl": 0.0635223388671875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.018,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.025103997439146042,
"mask/share_reasoning": 0.7836605310440063,
"mask/share_step_conf": 0.18342293798923492,
"num_tokens": 10183860.0,
"reward": 0.691365122795105,
"reward_std": 0.17519938945770264,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7587417960166931,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3005508780479431,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7703639268875122,
"adv/mean_abs_reasoning": 0.40508854389190674,
"adv/mean_abs_step_conf": 0.7641962766647339,
"adv/ratio_final_to_reasoning": 1.9017173862440184,
"adv/ratio_step_to_reasoning": 1.8864919489519085,
"adv/std_final_conf": 0.9283607602119446,
"adv/std_reasoning": 0.6612812876701355,
"adv/std_step_conf": 0.9356377124786377,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.73046875,
"calib/ece": 0.11338582677165357,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0071595394736843065,
"calib/mean_conf": 0.635511811023622,
"calib/mu_c": 0.6373157894736843,
"calib/mu_w": 0.63015625,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.00043307086614173264,
"calib/std_conf": 0.05153866618829232,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5749976862563627,
"calib/step_q_c_n": 2161.0,
"calib/step_q_gap": -0.014384736546487642,
"calib/step_q_w": 0.5893824228028504,
"calib/step_q_w_n": 842.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2060.0,
"completions/max_terminated_length": 2060.0,
"completions/mean_length": 647.30078125,
"completions/mean_terminated_length": 654.976318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.36007630825042725,
"kl": 0.0668487548828125,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0003,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02514474466443062,
"mask/share_reasoning": 0.7692458033561707,
"mask/share_step_conf": 0.19389072060585022,
"num_tokens": 10454681.0,
"reward": 0.7932491302490234,
"reward_std": 0.20490731298923492,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.792660117149353,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.4469631314277649,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7443052530288696,
"adv/mean_abs_reasoning": 0.3177647888660431,
"adv/mean_abs_step_conf": 0.7797359228134155,
"adv/ratio_final_to_reasoning": 2.3423150679625455,
"adv/ratio_step_to_reasoning": 2.4538147401288093,
"adv/std_final_conf": 0.9288173317909241,
"adv/std_reasoning": 0.618408203125,
"adv/std_step_conf": 0.9358792901039124,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.7578125,
"calib/ece": 0.08882352941176469,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.003586220629231196,
"calib/mean_conf": 0.6497647058823528,
"calib/mu_c": 0.6484567901234568,
"calib/mu_w": 0.652043010752688,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05164705882352938,
"calib/std_conf": 0.05471444166647668,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5834260178748759,
"calib/step_q_c_n": 2014.0,
"calib/step_q_gap": -0.002851937396689541,
"calib/step_q_w": 0.5862779552715655,
"calib/step_q_w_n": 1252.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2131.0,
"completions/max_terminated_length": 2131.0,
"completions/mean_length": 806.93359375,
"completions/mean_terminated_length": 813.2874145507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 269.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.2525840699672699,
"kl": 0.05096435546875,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0398,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020500820130109787,
"mask/share_reasoning": 0.7965471148490906,
"mask/share_step_conf": 0.17513957619667053,
"num_tokens": 10770512.0,
"reward": 0.6974085569381714,
"reward_std": 0.17602644860744476,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7604573965072632,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3085784614086151,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7312940359115601,
"adv/mean_abs_reasoning": 0.340577095746994,
"adv/mean_abs_step_conf": 0.721725344657898,
"adv/ratio_final_to_reasoning": 2.147220247760935,
"adv/ratio_step_to_reasoning": 2.119124725856636,
"adv/std_final_conf": 0.9102586507797241,
"adv/std_reasoning": 0.6185452938079834,
"adv/std_step_conf": 0.9327031970024109,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.02734375,
"calib/ece": 0.19439215686274505,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0021269841269840883,
"calib/mean_conf": 0.6291372549019607,
"calib/mu_c": 0.6287619047619049,
"calib/mu_w": 0.6308888888888889,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05036378235256556,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.571522506619594,
"calib/step_q_c_n": 2266.0,
"calib/step_q_gap": -0.013935303075199523,
"calib/step_q_w": 0.5854578096947936,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1800.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 614.36328125,
"completions/mean_terminated_length": 619.2008056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.0384,
"grad_norm": 0.6964386701583862,
"kl": 0.0863037109375,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0322,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02720923349261284,
"mask/share_reasoning": 0.7675433158874512,
"mask/share_step_conf": 0.1974349170923233,
"num_tokens": 11030501.0,
"reward": 0.8428558111190796,
"reward_std": 0.20296572148799896,
"rewards/accuracy_reward_step": 0.8203125,
"rewards/final_brier_reward_step": 0.8105496168136597,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.51188063621521,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.6925872564315796,
"adv/mean_abs_reasoning": 0.2421915978193283,
"adv/mean_abs_step_conf": 0.7570322751998901,
"adv/ratio_final_to_reasoning": 2.859666737688565,
"adv/ratio_step_to_reasoning": 3.1257577967862704,
"adv/std_final_conf": 0.9100953936576843,
"adv/std_reasoning": 0.5482887625694275,
"adv/std_step_conf": 0.93547523021698,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 11.7265625,
"calib/ece": 0.16091269841269842,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.011904761904761904,
"calib/gap": -0.03623740753786542,
"calib/mean_conf": 0.6321031746031744,
"calib/mu_c": 0.6198802395209582,
"calib/mu_w": 0.6561176470588236,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06515873015873017,
"calib/std_conf": 0.06378863029569816,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5700119904076738,
"calib/step_q_c_n": 1668.0,
"calib/step_q_gap": -0.03401349684869803,
"calib/step_q_w": 0.6040254872563718,
"calib/step_q_w_n": 1334.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2427.0,
"completions/max_terminated_length": 2427.0,
"completions/mean_length": 673.62109375,
"completions/mean_terminated_length": 681.6087036132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.228117436170578,
"kl": 0.06999969482421875,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0816,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.025216789916157722,
"mask/share_reasoning": 0.7774437069892883,
"mask/share_step_conf": 0.1856207549571991,
"num_tokens": 11310044.0,
"reward": 0.7160718441009521,
"reward_std": 0.171960711479187,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.743464469909668,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3613356053829193,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.75675368309021,
"adv/mean_abs_reasoning": 0.33615952730178833,
"adv/mean_abs_step_conf": 0.7887274026870728,
"adv/ratio_final_to_reasoning": 2.2511742837228343,
"adv/ratio_step_to_reasoning": 2.346289004562379,
"adv/std_final_conf": 0.9278170466423035,
"adv/std_reasoning": 0.6186119318008423,
"adv/std_step_conf": 0.9357684254646301,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.66015625,
"calib/ece": 0.12379446640316209,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03058963871847309,
"calib/mean_conf": 0.646403162055336,
"calib/mu_c": 0.635521472392638,
"calib/mu_w": 0.6661111111111111,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06296442687747035,
"calib/std_conf": 0.05874937986285626,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5749078726968174,
"calib/step_q_c_n": 1791.0,
"calib/step_q_gap": -0.03290592040663087,
"calib/step_q_w": 0.6078137931034483,
"calib/step_q_w_n": 1450.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 706.0234375,
"completions/mean_terminated_length": 714.395263671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 248.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.322736531496048,
"kl": 0.07154083251953125,
"learning_rate": 4.527777777777778e-06,
"loss": -0.033,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02300211228430271,
"mask/share_reasoning": 0.7782649993896484,
"mask/share_step_conf": 0.18701410293579102,
"num_tokens": 11597674.0,
"reward": 0.6569255590438843,
"reward_std": 0.20873770117759705,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7445077896118164,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.2443433701992035,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7578836679458618,
"adv/mean_abs_reasoning": 0.31737250089645386,
"adv/mean_abs_step_conf": 0.7529335021972656,
"adv/ratio_final_to_reasoning": 2.3879941261613253,
"adv/ratio_step_to_reasoning": 2.3723967894840334,
"adv/std_final_conf": 0.9278097152709961,
"adv/std_reasoning": 0.5960695743560791,
"adv/std_step_conf": 0.9358320236206055,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.0390625,
"calib/ece": 0.10490196078431367,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.03607945835462456,
"calib/mean_conf": 0.6428627450980392,
"calib/mu_c": 0.6282894736842105,
"calib/mu_w": 0.6643689320388351,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0758431372549019,
"calib/std_conf": 0.05763339840814769,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.569218958611482,
"calib/step_q_c_n": 1498.0,
"calib/step_q_gap": -0.03725200098447767,
"calib/step_q_w": 0.6064709595959596,
"calib/step_q_w_n": 1584.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2438.0,
"completions/max_terminated_length": 2438.0,
"completions/mean_length": 697.52734375,
"completions/mean_terminated_length": 703.0196533203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 230.0,
"epoch": 0.0416,
"grad_norm": 0.438943475484848,
"kl": 0.07904052734375,
"learning_rate": 4.5e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.024240538477897644,
"mask/share_reasoning": 0.778479814529419,
"mask/share_step_conf": 0.1894671469926834,
"num_tokens": 11882329.0,
"reward": 0.6734088659286499,
"reward_std": 0.19206441938877106,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7334707379341125,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.2953784167766571,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7486369609832764,
"adv/mean_abs_reasoning": 0.34529808163642883,
"adv/mean_abs_step_conf": 0.7777887582778931,
"adv/ratio_final_to_reasoning": 2.1680889665976513,
"adv/ratio_step_to_reasoning": 2.2525139861531063,
"adv/std_final_conf": 0.930046021938324,
"adv/std_reasoning": 0.6402245163917542,
"adv/std_step_conf": 0.9359531402587891,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.23046875,
"calib/ece": 0.08637795275590554,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.028257229832572506,
"calib/mean_conf": 0.6534645669291338,
"calib/mu_c": 0.6654794520547946,
"calib/mu_w": 0.6372222222222221,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08251968503937011,
"calib/std_conf": 0.06059908204335681,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5963791423001951,
"calib/step_q_c_n": 2052.0,
"calib/step_q_gap": 0.010341689116674546,
"calib/step_q_w": 0.5860374531835205,
"calib/step_q_w_n": 1335.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1857.0,
"completions/max_terminated_length": 1857.0,
"completions/mean_length": 745.60546875,
"completions/mean_terminated_length": 754.4466552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.7285353541374207,
"kl": 0.07080078125,
"learning_rate": 4.472222222222223e-06,
"loss": -0.071,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.023201102390885353,
"mask/share_reasoning": 0.775591254234314,
"mask/share_step_conf": 0.18948885798454285,
"num_tokens": 12179964.0,
"reward": 0.6214379072189331,
"reward_std": 0.20891620218753815,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.753614068031311,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.17676173150539398,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7398024797439575,
"adv/mean_abs_reasoning": 0.25526443123817444,
"adv/mean_abs_step_conf": 0.7453057765960693,
"adv/ratio_final_to_reasoning": 2.898180824314238,
"adv/ratio_step_to_reasoning": 2.9197400240249762,
"adv/std_final_conf": 0.9244346618652344,
"adv/std_reasoning": 0.548233687877655,
"adv/std_step_conf": 0.9355214834213257,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.59765625,
"calib/ece": 0.21854901960784326,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.029637999122422154,
"calib/mean_conf": 0.6309411764705882,
"calib/mu_c": 0.625943396226415,
"calib/mu_w": 0.6555813953488372,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.009058823529411762,
"calib/std_conf": 0.047994040383587296,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.56872424375274,
"calib/step_q_c_n": 2281.0,
"calib/step_q_gap": -0.03498215159609719,
"calib/step_q_w": 0.6037063953488372,
"calib/step_q_w_n": 688.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1644.0,
"completions/max_terminated_length": 1644.0,
"completions/mean_length": 646.95703125,
"completions/mean_terminated_length": 652.0512084960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.22850604355335236,
"kl": 0.07393646240234375,
"learning_rate": 4.444444444444444e-06,
"loss": -0.015,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.026456307619810104,
"mask/share_reasoning": 0.7719156742095947,
"mask/share_step_conf": 0.19381554424762726,
"num_tokens": 12452833.0,
"reward": 0.8365099430084229,
"reward_std": 0.1689138412475586,
"rewards/accuracy_reward_step": 0.828125,
"rewards/final_brier_reward_step": 0.8058613538742065,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.5023148059844971,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.6996697187423706,
"adv/mean_abs_reasoning": 0.18548455834388733,
"adv/mean_abs_step_conf": 0.740502119064331,
"adv/ratio_final_to_reasoning": 3.7721184177778664,
"adv/ratio_step_to_reasoning": 3.992257499362531,
"adv/std_final_conf": 0.9113990068435669,
"adv/std_reasoning": 0.49572163820266724,
"adv/std_step_conf": 0.9348853826522827,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 10.6875,
"calib/ece": 0.10398437499999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0032569470879328932,
"calib/mean_conf": 0.626015625,
"calib/mu_c": 0.6269189189189188,
"calib/mu_w": 0.6236619718309859,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.003671875000000004,
"calib/std_conf": 0.04343349520657273,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.568086513994911,
"calib/step_q_c_n": 1965.0,
"calib/step_q_gap": 5.797962396414391e-05,
"calib/step_q_w": 0.5680285343709468,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1685.0,
"completions/max_terminated_length": 1685.0,
"completions/mean_length": 572.265625,
"completions/mean_terminated_length": 576.7716674804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.0448,
"grad_norm": 0.22524091601371765,
"kl": 0.0933380126953125,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0403,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.026845600455999374,
"mask/share_reasoning": 0.76666259765625,
"mask/share_step_conf": 0.19867932796478271,
"num_tokens": 12703701.0,
"reward": 0.7134827375411987,
"reward_std": 0.12991458177566528,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7896554470062256,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.29277873039245605,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.754668116569519,
"adv/mean_abs_reasoning": 0.4721015691757202,
"adv/mean_abs_step_conf": 0.7553716897964478,
"adv/ratio_final_to_reasoning": 1.598529142546919,
"adv/ratio_step_to_reasoning": 1.6000194430942296,
"adv/std_final_conf": 0.9316073656082153,
"adv/std_reasoning": 0.7391892075538635,
"adv/std_step_conf": 0.935806930065155,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.58984375,
"calib/ece": 0.10503906249999993,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.008032736962314546,
"calib/mean_conf": 0.6443359375,
"calib/mu_c": 0.642108108108108,
"calib/mu_w": 0.6501408450704226,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.013359374999999968,
"calib/std_conf": 0.054489359245600374,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5799176062445793,
"calib/step_q_c_n": 2306.0,
"calib/step_q_gap": -0.009733429742334598,
"calib/step_q_w": 0.5896510359869139,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1576.0,
"completions/max_terminated_length": 1576.0,
"completions/mean_length": 697.98046875,
"completions/mean_terminated_length": 703.4763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.625836193561554,
"kl": 0.0879974365234375,
"learning_rate": 4.388888888888889e-06,
"loss": -0.007,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.024772757664322853,
"mask/share_reasoning": 0.7765905857086182,
"mask/share_step_conf": 0.1908242106437683,
"num_tokens": 12987608.0,
"reward": 0.7277460098266602,
"reward_std": 0.20869551599025726,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7872527241706848,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.32370805740356445,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7820515036582947,
"adv/mean_abs_reasoning": 0.369723379611969,
"adv/mean_abs_step_conf": 0.770028293132782,
"adv/ratio_final_to_reasoning": 2.1152341095634015,
"adv/ratio_step_to_reasoning": 2.0827146336835387,
"adv/std_final_conf": 0.929624080657959,
"adv/std_reasoning": 0.6402297019958496,
"adv/std_step_conf": 0.9356967210769653,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.5078125,
"calib/ece": 0.09227450980392164,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.006852781815778863,
"calib/mean_conf": 0.6474117647058825,
"calib/mu_c": 0.6499378881987576,
"calib/mu_w": 0.6430851063829788,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.054156862745098115,
"calib/std_conf": 0.056564790708233025,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5848204623708805,
"calib/step_q_c_n": 2033.0,
"calib/step_q_gap": 0.009747750651462161,
"calib/step_q_w": 0.5750727117194183,
"calib/step_q_w_n": 1169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2021.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 721.84765625,
"completions/mean_terminated_length": 727.531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.23728294670581818,
"kl": 0.076202392578125,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0242,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02301742322742939,
"mask/share_reasoning": 0.7762603759765625,
"mask/share_step_conf": 0.19290973246097565,
"num_tokens": 13278721.0,
"reward": 0.6986934542655945,
"reward_std": 0.18942584097385406,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7639957070350647,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.308391273021698,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7148399353027344,
"adv/mean_abs_reasoning": 0.35291802883148193,
"adv/mean_abs_step_conf": 0.7707694172859192,
"adv/ratio_final_to_reasoning": 2.0255126598932405,
"adv/ratio_step_to_reasoning": 2.183989919239748,
"adv/std_final_conf": 0.9121301770210266,
"adv/std_reasoning": 0.6401578187942505,
"adv/std_step_conf": 0.9362253546714783,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.640625,
"calib/ece": 0.05878906250000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": 0.025874628165130864,
"calib/mean_conf": 0.6404296875,
"calib/mu_c": 0.6482122905027934,
"calib/mu_w": 0.6223376623376625,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05550382638748741,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5797810898928738,
"calib/step_q_c_n": 2147.0,
"calib/step_q_gap": 0.01256620393849206,
"calib/step_q_w": 0.5672148859543817,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1737.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 679.6484375,
"completions/mean_terminated_length": 685.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.048,
"grad_norm": 8878.478515625,
"kl": 136.07752990722656,
"learning_rate": 4.333333333333334e-06,
"loss": 1.7789,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.024658601731061935,
"mask/share_reasoning": 0.7740293741226196,
"mask/share_step_conf": 0.19349952042102814,
"num_tokens": 13557759.0,
"reward": 0.6431659460067749,
"reward_std": 0.2364698052406311,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7940347194671631,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.1524534672498703,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7531983256340027,
"adv/mean_abs_reasoning": 0.4143233895301819,
"adv/mean_abs_step_conf": 0.7525116205215454,
"adv/ratio_final_to_reasoning": 1.817899603708313,
"adv/ratio_step_to_reasoning": 1.8162421903693367,
"adv/std_final_conf": 0.927490234375,
"adv/std_reasoning": 0.6816543936729431,
"adv/std_step_conf": 0.9355647563934326,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 12.65625,
"calib/ece": 0.08423886639676122,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.012145748987854251,
"calib/gap": -0.028574203914694674,
"calib/mean_conf": 0.6410242914979758,
"calib/mu_c": 0.6313067484662577,
"calib/mu_w": 0.6598809523809523,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03267206477732801,
"calib/std_conf": 0.08169286447591004,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5812162309368192,
"calib/step_q_c_n": 1836.0,
"calib/step_q_gap": -0.04324245852187025,
"calib/step_q_w": 0.6244586894586894,
"calib/step_q_w_n": 1404.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3002.0,
"completions/max_terminated_length": 3002.0,
"completions/mean_length": 683.46875,
"completions/mean_terminated_length": 708.3724975585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.28518831729888916,
"kl": 0.08718109130859375,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0314,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.025916893035173416,
"mask/share_reasoning": 0.7426183819770813,
"mask/share_step_conf": 0.1963084638118744,
"num_tokens": 13837495.0,
"reward": 0.6834006309509277,
"reward_std": 0.19034671783447266,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.729149580001831,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.3173391819000244,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7467596530914307,
"adv/mean_abs_reasoning": 0.1418902724981308,
"adv/mean_abs_step_conf": 0.7821815609931946,
"adv/ratio_final_to_reasoning": 5.262937620345102,
"adv/ratio_step_to_reasoning": 5.512580582319332,
"adv/std_final_conf": 0.9253977537155151,
"adv/std_reasoning": 0.4049662947654724,
"adv/std_step_conf": 0.9351449608802795,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.69921875,
"calib/ece": 0.18429133858267724,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.039278743855015064,
"calib/mean_conf": 0.6492519685039368,
"calib/mu_c": 0.6373446327683615,
"calib/mu_w": 0.6766233766233766,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06834645669291337,
"calib/std_conf": 0.05385010816765645,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5702151639344262,
"calib/step_q_c_n": 1952.0,
"calib/step_q_gap": -0.03858391227804503,
"calib/step_q_w": 0.6087990762124712,
"calib/step_q_w_n": 1299.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2169.0,
"completions/max_terminated_length": 2169.0,
"completions/mean_length": 742.6328125,
"completions/mean_terminated_length": 751.4387817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.12521328032016754,
"kl": 0.075958251953125,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0114,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021606799215078354,
"mask/share_reasoning": 0.7760756015777588,
"mask/share_step_conf": 0.19059887528419495,
"num_tokens": 14133585.0,
"reward": 0.6950026154518127,
"reward_std": 0.12959975004196167,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7609972357749939,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.29228928685188293,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7752068042755127,
"adv/mean_abs_reasoning": 0.34300339221954346,
"adv/mean_abs_step_conf": 0.7662767171859741,
"adv/ratio_final_to_reasoning": 2.260055794956489,
"adv/ratio_step_to_reasoning": 2.234020813110529,
"adv/std_final_conf": 0.929909348487854,
"adv/std_reasoning": 0.618562638759613,
"adv/std_step_conf": 0.9360288381576538,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.26953125,
"calib/ece": 0.11050980392156864,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.017300155082475843,
"calib/mean_conf": 0.631921568627451,
"calib/mu_c": 0.6263583815028901,
"calib/mu_w": 0.643658536585366,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.032,
"calib/std_conf": 0.05036176698657422,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5689640591966173,
"calib/step_q_c_n": 1892.0,
"calib/step_q_gap": -0.01274792469059316,
"calib/step_q_w": 0.5817119838872105,
"calib/step_q_w_n": 993.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1904.0,
"completions/max_terminated_length": 1904.0,
"completions/mean_length": 650.8046875,
"completions/mean_terminated_length": 655.9291381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.0512,
"grad_norm": 0.2294154167175293,
"kl": 0.092254638671875,
"learning_rate": 4.25e-06,
"loss": -0.0293,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.026431187987327576,
"mask/share_reasoning": 0.7731444239616394,
"mask/share_step_conf": 0.19261188805103302,
"num_tokens": 14403879.0,
"reward": 0.7620805501937866,
"reward_std": 0.21728140115737915,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7665836215019226,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.4232025742530823,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7520368099212646,
"adv/mean_abs_reasoning": 0.3485822379589081,
"adv/mean_abs_step_conf": 0.752032995223999,
"adv/ratio_final_to_reasoning": 2.1574157487907257,
"adv/ratio_step_to_reasoning": 2.1574048053264576,
"adv/std_final_conf": 0.9283875823020935,
"adv/std_reasoning": 0.6185300946235657,
"adv/std_step_conf": 0.9361112713813782,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 10.546875,
"calib/ece": 0.09782608695652174,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0036701964395333686,
"calib/mean_conf": 0.6333201581027669,
"calib/mu_c": 0.6343646408839778,
"calib/mu_w": 0.6306944444444444,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.007865612648221346,
"calib/std_conf": 0.041082730914846016,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5679769994772609,
"calib/step_q_c_n": 1913.0,
"calib/step_q_gap": -0.0016163931529805664,
"calib/step_q_w": 0.5695933926302414,
"calib/step_q_w_n": 787.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 644.6171875,
"completions/mean_terminated_length": 654.8492431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 261.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.2470749318599701,
"kl": 0.08589935302734375,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0081,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.024077853187918663,
"mask/share_reasoning": 0.7702655792236328,
"mask/share_step_conf": 0.19003157317638397,
"num_tokens": 14673437.0,
"reward": 0.701884388923645,
"reward_std": 0.2006235271692276,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7802191376686096,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.2844870388507843,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7255159616470337,
"adv/mean_abs_reasoning": 0.3110960125923157,
"adv/mean_abs_step_conf": 0.7348172068595886,
"adv/ratio_final_to_reasoning": 2.3321287714407513,
"adv/ratio_step_to_reasoning": 2.36202708204605,
"adv/std_final_conf": 0.9274781346321106,
"adv/std_reasoning": 0.6185328960418701,
"adv/std_step_conf": 0.9359754920005798,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 11.80859375,
"calib/ece": 0.12241106719367606,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.007905138339920948,
"calib/gap": 0.005098189890710647,
"calib/mean_conf": 0.6428853754940712,
"calib/mu_c": 0.6441145833333334,
"calib/mu_w": 0.6390163934426227,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.003201581027667973,
"calib/std_conf": 0.06874923430085506,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5843965517241381,
"calib/step_q_c_n": 2204.0,
"calib/step_q_gap": -0.0059819586543722325,
"calib/step_q_w": 0.5903785103785103,
"calib/step_q_w_n": 819.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2336.0,
"completions/max_terminated_length": 2336.0,
"completions/mean_length": 699.2890625,
"completions/mean_terminated_length": 704.7952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.2760617136955261,
"kl": 0.08426666259765625,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0539,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02490140125155449,
"mask/share_reasoning": 0.7720136046409607,
"mask/share_step_conf": 0.19527247548103333,
"num_tokens": 14957815.0,
"reward": 0.7315381169319153,
"reward_std": 0.17188885807991028,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7913238406181335,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.324096143245697,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.6964578628540039,
"adv/mean_abs_reasoning": 0.3133673369884491,
"adv/mean_abs_step_conf": 0.7605372667312622,
"adv/ratio_final_to_reasoning": 2.222496669714099,
"adv/ratio_step_to_reasoning": 2.4269832141417345,
"adv/std_final_conf": 0.9126064777374268,
"adv/std_reasoning": 0.6184675693511963,
"adv/std_step_conf": 0.9359240531921387,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.35546875,
"calib/ece": 0.18976377952755905,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0368697614442296,
"calib/mean_conf": 0.6375590551181102,
"calib/mu_c": 0.6279787234042552,
"calib/mu_w": 0.6648484848484848,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04358267716535434,
"calib/std_conf": 0.055033512130109555,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5662160751565762,
"calib/step_q_c_n": 1916.0,
"calib/step_q_gap": -0.03752761808257621,
"calib/step_q_w": 0.6037436932391524,
"calib/step_q_w_n": 991.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2833.0,
"completions/max_terminated_length": 2833.0,
"completions/mean_length": 722.0390625,
"completions/mean_terminated_length": 724.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.0544,
"grad_norm": 0.33014073967933655,
"kl": 0.07884979248046875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0451,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.023661799728870392,
"mask/share_reasoning": 0.7851213216781616,
"mask/share_step_conf": 0.18731063604354858,
"num_tokens": 15251953.0,
"reward": 0.7580503821372986,
"reward_std": 0.19549910724163055,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7738453149795532,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.39694297313690186,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7437102794647217,
"adv/mean_abs_reasoning": 0.19842737913131714,
"adv/mean_abs_step_conf": 0.7377064824104309,
"adv/ratio_final_to_reasoning": 3.74802248923795,
"adv/ratio_step_to_reasoning": 3.7177655908171046,
"adv/std_final_conf": 0.927570104598999,
"adv/std_reasoning": 0.4959178566932678,
"adv/std_step_conf": 0.936040997505188,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.62890625,
"calib/ece": 0.17385826771653545,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": -0.023765092243794084,
"calib/mean_conf": 0.6466929133858267,
"calib/mu_c": 0.6419211822660099,
"calib/mu_w": 0.665686274509804,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010669291338582677,
"calib/std_conf": 0.06702984548692002,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5801745747538049,
"calib/step_q_c_n": 2234.0,
"calib/step_q_gap": -0.019771589445387594,
"calib/step_q_w": 0.5999461641991924,
"calib/step_q_w_n": 743.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2463.0,
"completions/max_terminated_length": 2463.0,
"completions/mean_length": 739.28515625,
"completions/mean_terminated_length": 745.1063232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.4081308841705322,
"kl": 0.08150482177734375,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0239,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.024338986724615097,
"mask/share_reasoning": 0.7771621346473694,
"mask/share_step_conf": 0.1906864047050476,
"num_tokens": 15549162.0,
"reward": 0.7791553139686584,
"reward_std": 0.1678694188594818,
"rewards/accuracy_reward_step": 0.79296875,
"rewards/final_brier_reward_step": 0.797863245010376,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.4034160375595093,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7358145117759705,
"adv/mean_abs_reasoning": 0.34003329277038574,
"adv/mean_abs_step_conf": 0.7458301782608032,
"adv/ratio_final_to_reasoning": 2.16394843511057,
"adv/ratio_step_to_reasoning": 2.1934033934860606,
"adv/std_final_conf": 0.9298948049545288,
"adv/std_reasoning": 0.6401073932647705,
"adv/std_step_conf": 0.9360513687133789,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.49609375,
"calib/ece": 0.14941406250000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.004433285509325646,
"calib/mean_conf": 0.6552734375,
"calib/mu_c": 0.654390243902439,
"calib/mu_w": 0.6588235294117647,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.001953125000000001,
"calib/std_conf": 0.05525441368735708,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.58257016840417,
"calib/step_q_c_n": 2494.0,
"calib/step_q_gap": -0.016536214574553454,
"calib/step_q_w": 0.5991063829787234,
"calib/step_q_w_n": 705.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 771.90234375,
"completions/mean_terminated_length": 777.9802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.21992897987365723,
"kl": 0.0755157470703125,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0655,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020725980401039124,
"mask/share_reasoning": 0.7839224338531494,
"mask/share_step_conf": 0.18753905594348907,
"num_tokens": 15852593.0,
"reward": 0.7674970626831055,
"reward_std": 0.20992466807365417,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8148292899131775,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.3600085973739624,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.7923758029937744,
"adv/mean_abs_reasoning": 0.2119789868593216,
"adv/mean_abs_step_conf": 0.7696312665939331,
"adv/ratio_final_to_reasoning": 3.737992216745659,
"adv/ratio_step_to_reasoning": 3.6306960326435265,
"adv/std_final_conf": 0.9255867004394531,
"adv/std_reasoning": 0.467656672000885,
"adv/std_step_conf": 0.9360727667808533,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 10.70703125,
"calib/ece": 0.21914453125000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.013305408015451659,
"calib/mean_conf": 0.6339882812500001,
"calib/mu_c": 0.6359633027522935,
"calib/mu_w": 0.6226578947368419,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0007851562499999982,
"calib/std_conf": 0.06745568466720131,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.577243031358885,
"calib/step_q_c_n": 2296.0,
"calib/step_q_gap": -0.016914272011901432,
"calib/step_q_w": 0.5941573033707864,
"calib/step_q_w_n": 445.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2180.0,
"completions/max_terminated_length": 2180.0,
"completions/mean_length": 667.765625,
"completions/mean_terminated_length": 673.0236206054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.0576,
"grad_norm": 0.3071500360965729,
"kl": 0.1081085205078125,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0084,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02563382312655449,
"mask/share_reasoning": 0.777256965637207,
"mask/share_step_conf": 0.189296692609787,
"num_tokens": 16129773.0,
"reward": 0.8549725413322449,
"reward_std": 0.18176884949207306,
"rewards/accuracy_reward_step": 0.8515625,
"rewards/final_brier_reward_step": 0.8250710964202881,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.5145614743232727,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.7537381649017334,
"adv/mean_abs_reasoning": 0.3317815065383911,
"adv/mean_abs_step_conf": 0.7744541168212891,
"adv/ratio_final_to_reasoning": 2.271790772083063,
"adv/ratio_step_to_reasoning": 2.3342293092266595,
"adv/std_final_conf": 0.9304205775260925,
"adv/std_reasoning": 0.6186366081237793,
"adv/std_step_conf": 0.9362649321556091,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.35546875,
"calib/ece": 0.14776892430278885,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.04838398714687364,
"calib/mean_conf": 0.6510358565737052,
"calib/mu_c": 0.6323376623376623,
"calib/mu_w": 0.680721649484536,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09262948207171313,
"calib/std_conf": 0.05757830895843759,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5705675146771038,
"calib/step_q_c_n": 1533.0,
"calib/step_q_gap": -0.04119935648854045,
"calib/step_q_w": 0.6117668711656442,
"calib/step_q_w_n": 1630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2965.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 745.17578125,
"completions/mean_terminated_length": 748.0980834960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.058666666666666666,
"grad_norm": 1.5653352737426758,
"kl": 0.0928497314453125,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0394,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.022620702162384987,
"mask/share_reasoning": 0.7736297845840454,
"mask/share_step_conf": 0.19984331727027893,
"num_tokens": 16428362.0,
"reward": 0.6656664609909058,
"reward_std": 0.23249629139900208,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7208675742149353,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.29405906796455383,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7665127515792847,
"adv/mean_abs_reasoning": 0.39497989416122437,
"adv/mean_abs_step_conf": 0.7779443264007568,
"adv/ratio_final_to_reasoning": 1.94063739170077,
"adv/ratio_step_to_reasoning": 1.9695795606325537,
"adv/std_final_conf": 0.9304113984107971,
"adv/std_reasoning": 0.6612225770950317,
"adv/std_step_conf": 0.9364283084869385,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.171875,
"calib/ece": 0.06500000000000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": 0.006969597907813041,
"calib/mean_conf": 0.64859375,
"calib/mu_c": 0.6511801242236025,
"calib/mu_w": 0.6442105263157895,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.042343750000000034,
"calib/std_conf": 0.06418034131209883,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5809301014656144,
"calib/step_q_c_n": 1774.0,
"calib/step_q_gap": 0.0017127902317285448,
"calib/step_q_w": 0.5792173112338859,
"calib/step_q_w_n": 1086.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1917.0,
"completions/max_terminated_length": 1917.0,
"completions/mean_length": 747.08984375,
"completions/mean_terminated_length": 752.972412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 263.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.21732014417648315,
"kl": 0.08220672607421875,
"learning_rate": 4.027777777777779e-06,
"loss": 0.0511,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022660259157419205,
"mask/share_reasoning": 0.7831286787986755,
"mask/share_step_conf": 0.18639856576919556,
"num_tokens": 16726457.0,
"reward": 0.707023024559021,
"reward_std": 0.22808465361595154,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7653632760047913,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.32290148735046387,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.7563657760620117,
"adv/mean_abs_reasoning": 0.25031182169914246,
"adv/mean_abs_step_conf": 0.7859717011451721,
"adv/ratio_final_to_reasoning": 3.02169418498784,
"adv/ratio_step_to_reasoning": 3.1399703610077827,
"adv/std_final_conf": 0.9285401701927185,
"adv/std_reasoning": 0.5483488440513611,
"adv/std_step_conf": 0.9359719157218933,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 12.4140625,
"calib/ece": 0.21637096774193545,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.004032258064516129,
"calib/gap": -0.03967926689576173,
"calib/mean_conf": 0.6545161290322581,
"calib/mu_c": 0.6458762886597939,
"calib/mu_w": 0.6855555555555556,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.04431451612903227,
"calib/std_conf": 0.05451923068099795,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5745685997171146,
"calib/step_q_c_n": 2121.0,
"calib/step_q_gap": -0.05970765383066223,
"calib/step_q_w": 0.6342762535477768,
"calib/step_q_w_n": 1057.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 727.50390625,
"completions/mean_terminated_length": 741.9960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 336.0,
"epoch": 0.0608,
"grad_norm": 0.1882893294095993,
"kl": 0.0820465087890625,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0913,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.021145198494195938,
"mask/share_reasoning": 0.762434720993042,
"mask/share_step_conf": 0.19688883423805237,
"num_tokens": 17019490.0,
"reward": 0.7441555261611938,
"reward_std": 0.19160419702529907,
"rewards/accuracy_reward_step": 0.7578125,
"rewards/final_brier_reward_step": 0.7693679332733154,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.37441182136535645,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7255205512046814,
"adv/mean_abs_reasoning": 0.3990098834037781,
"adv/mean_abs_step_conf": 0.7807788252830505,
"adv/ratio_final_to_reasoning": 1.8183022059894463,
"adv/ratio_step_to_reasoning": 1.9567906905527483,
"adv/std_final_conf": 0.9329466819763184,
"adv/std_reasoning": 0.7013241052627563,
"adv/std_step_conf": 0.9364590048789978,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.99609375,
"calib/ece": 0.0705179282868526,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0199203187250996,
"calib/gap": 0.015800388403384713,
"calib/mean_conf": 0.6770517928286852,
"calib/mu_c": 0.6826543209876543,
"calib/mu_w": 0.6668539325842696,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.051075697211155374,
"calib/std_conf": 0.07998053365487154,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6020530726256983,
"calib/step_q_c_n": 2148.0,
"calib/step_q_gap": -0.02250441866350028,
"calib/step_q_w": 0.6245574912891986,
"calib/step_q_w_n": 1435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2918.0,
"completions/max_terminated_length": 2918.0,
"completions/mean_length": 863.31640625,
"completions/mean_terminated_length": 877.0198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.19960562884807587,
"kl": 0.069580078125,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0834,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019993755966424942,
"mask/share_reasoning": 0.7693009972572327,
"mask/share_step_conf": 0.1950802505016327,
"num_tokens": 17346819.0,
"reward": 0.6891767978668213,
"reward_std": 0.24477124214172363,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7559226751327515,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.299774706363678,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.753639817237854,
"adv/mean_abs_reasoning": 0.40339523553848267,
"adv/mean_abs_step_conf": 0.7527267932891846,
"adv/ratio_final_to_reasoning": 1.8682417412090606,
"adv/ratio_step_to_reasoning": 1.8659783928394384,
"adv/std_final_conf": 0.9298037886619568,
"adv/std_reasoning": 0.6815541982650757,
"adv/std_step_conf": 0.9364940524101257,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.56640625,
"calib/ece": 0.04047244094488186,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007023617820719141,
"calib/mean_conf": 0.6483464566929135,
"calib/mu_c": 0.6458024691358025,
"calib/mu_w": 0.6528260869565217,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.025511811023622,
"calib/std_conf": 0.05279232495409867,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5748107798165137,
"calib/step_q_c_n": 1744.0,
"calib/step_q_gap": -0.01329111007666628,
"calib/step_q_w": 0.58810188989318,
"calib/step_q_w_n": 1217.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1761.0,
"completions/max_terminated_length": 1761.0,
"completions/mean_length": 761.578125,
"completions/mean_terminated_length": 767.5748291015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.5024107098579407,
"kl": 0.0820159912109375,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0103,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02243354171514511,
"mask/share_reasoning": 0.7792695760726929,
"mask/share_step_conf": 0.1904844045639038,
"num_tokens": 17648031.0,
"reward": 0.6598141193389893,
"reward_std": 0.2476826310157776,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7568843960762024,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.2377437800168991,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.7591273188591003,
"adv/mean_abs_reasoning": 0.5218755006790161,
"adv/mean_abs_step_conf": 0.775875449180603,
"adv/ratio_final_to_reasoning": 1.4546138262313408,
"adv/ratio_step_to_reasoning": 1.4867060212083258,
"adv/std_final_conf": 0.9318666458129883,
"adv/std_reasoning": 0.7754560112953186,
"adv/std_step_conf": 0.9364181756973267,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 11.859375,
"calib/ece": 0.09024896265560166,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.004149377593360996,
"calib/gap": -0.018300289766661537,
"calib/mean_conf": 0.6528215767634854,
"calib/mu_c": 0.6465189873417722,
"calib/mu_w": 0.6648192771084337,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.043734439834024905,
"calib/std_conf": 0.05413527423211244,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.57798463356974,
"calib/step_q_c_n": 1692.0,
"calib/step_q_gap": -0.02137548547787904,
"calib/step_q_w": 0.599360119047619,
"calib/step_q_w_n": 1344.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2933.0,
"completions/max_terminated_length": 2933.0,
"completions/mean_length": 780.3125,
"completions/mean_terminated_length": 792.698486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.064,
"grad_norm": 0.47881805896759033,
"kl": 0.07741546630859375,
"learning_rate": 3.916666666666667e-06,
"loss": -0.1225,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.020657043904066086,
"mask/share_reasoning": 0.7711943984031677,
"mask/share_step_conf": 0.1925235241651535,
"num_tokens": 17956647.0,
"reward": 0.619196891784668,
"reward_std": 0.2881484627723694,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7128839492797852,
"rewards/format_reward_step": 0.93359375,
"rewards/step_correlation_reward": 0.21379104256629944,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.7319515347480774,
"adv/mean_abs_reasoning": 0.2323523759841919,
"adv/mean_abs_step_conf": 0.759729266166687,
"adv/ratio_final_to_reasoning": 3.1501788249321616,
"adv/ratio_step_to_reasoning": 3.2697288458904126,
"adv/std_final_conf": 0.9271352291107178,
"adv/std_reasoning": 0.548259437084198,
"adv/std_step_conf": 0.9357583522796631,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 10.5859375,
"calib/ece": 0.14762096774193548,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.01094119893088974,
"calib/mean_conf": 0.6399596774193548,
"calib/mu_c": 0.6375773195876289,
"calib/mu_w": 0.6485185185185186,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.002661290322580646,
"calib/std_conf": 0.05387783361211411,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5723488943488944,
"calib/step_q_c_n": 2035.0,
"calib/step_q_gap": -0.018080735280735283,
"calib/step_q_w": 0.5904296296296296,
"calib/step_q_w_n": 675.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1609.0,
"completions/max_terminated_length": 1609.0,
"completions/mean_length": 677.70703125,
"completions/mean_terminated_length": 685.7431030273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.6927701234817505,
"kl": 0.0820159912109375,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0894,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.024425944313406944,
"mask/share_reasoning": 0.7670689821243286,
"mask/share_step_conf": 0.1967863142490387,
"num_tokens": 18234204.0,
"reward": 0.7487568259239197,
"reward_std": 0.17801451683044434,
"rewards/accuracy_reward_step": 0.7734375,
"rewards/final_brier_reward_step": 0.7777035236358643,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.3713727295398712,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.5855187773704529,
"adv/mean_abs_reasoning": 0.6621418595314026,
"adv/mean_abs_step_conf": 0.5837773680686951,
"adv/ratio_final_to_reasoning": 0.8842799604677224,
"adv/ratio_step_to_reasoning": 0.8816499963947816,
"adv/std_final_conf": 0.8104560971260071,
"adv/std_reasoning": 0.8591093420982361,
"adv/std_step_conf": 0.8111568093299866,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 11.80859375,
"calib/ece": 0.07259259259259261,
"calib/final_conf_rate": 0.52734375,
"calib/format_rate": 0.52734375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.011274001037882786,
"calib/mean_conf": 0.6237037037037039,
"calib/mu_c": 0.6271276595744681,
"calib/mu_w": 0.6158536585365854,
"calib/nonempty_final_conf_rate": 0.52734375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.039266246757625405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5861579892280072,
"calib/step_q_c_n": 2228.0,
"calib/step_q_gap": -0.006156476180797843,
"calib/step_q_w": 0.5923144654088051,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2486.0,
"completions/max_terminated_length": 2486.0,
"completions/mean_length": 800.28515625,
"completions/mean_terminated_length": 806.5866088867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.06613333333333334,
"grad_norm": 10.070162773132324,
"kl": 0.0822296142578125,
"learning_rate": 3.861111111111112e-06,
"loss": -0.4774,
"mask/has_final_conf_rate": 0.52734375,
"mask/share_final_conf": 0.014446118846535683,
"mask/share_reasoning": 0.7766348123550415,
"mask/share_step_conf": 0.20110660791397095,
"num_tokens": 18546157.0,
"reward": 0.4277394413948059,
"reward_std": 0.26725536584854126,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.41474997997283936,
"rewards/format_reward_step": 0.52734375,
"rewards/step_correlation_reward": 0.18838509917259216,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7331758737564087,
"adv/mean_abs_reasoning": 0.4314548075199127,
"adv/mean_abs_step_conf": 0.7121888399124146,
"adv/ratio_final_to_reasoning": 1.699310938197324,
"adv/ratio_step_to_reasoning": 1.6506684535657776,
"adv/std_final_conf": 0.9166691303253174,
"adv/std_reasoning": 0.7014430165290833,
"adv/std_step_conf": 0.9218115210533142,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.625,
"calib/ece": 0.08731818181818182,
"calib/final_conf_rate": 0.859375,
"calib/format_rate": 0.85546875,
"calib/frac_conf_gt_0.9": 0.004545454545454545,
"calib/gap": -0.014696969696969653,
"calib/mean_conf": 0.6498636363636363,
"calib/mu_c": 0.6454545454545455,
"calib/mu_w": 0.6601515151515152,
"calib/nonempty_final_conf_rate": 0.859375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.018590909090909064,
"calib/std_conf": 0.05784367291128537,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5940733197556007,
"calib/step_q_c_n": 1964.0,
"calib/step_q_gap": -0.0131191092664813,
"calib/step_q_w": 0.607192429022082,
"calib/step_q_w_n": 1268.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2722.0,
"completions/max_terminated_length": 2722.0,
"completions/mean_length": 888.6328125,
"completions/mean_terminated_length": 888.6328125,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.0672,
"grad_norm": 0.47227707505226135,
"kl": 0.07555389404296875,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0276,
"mask/has_final_conf_rate": 0.859375,
"mask/share_final_conf": 0.01991172507405281,
"mask/share_reasoning": 0.7896748781204224,
"mask/share_step_conf": 0.19041332602500916,
"num_tokens": 18882287.0,
"reward": 0.6191070675849915,
"reward_std": 0.24419625103473663,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6652539372444153,
"rewards/format_reward_step": 0.85546875,
"rewards/step_correlation_reward": 0.26905399560928345,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7193543314933777,
"adv/mean_abs_reasoning": 0.2724745273590088,
"adv/mean_abs_step_conf": 0.7900326251983643,
"adv/ratio_final_to_reasoning": 2.640079197368663,
"adv/ratio_step_to_reasoning": 2.899473330060787,
"adv/std_final_conf": 0.914580225944519,
"adv/std_reasoning": 0.5726332068443298,
"adv/std_step_conf": 0.9362464547157288,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.85546875,
"calib/ece": 0.1372047244094489,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": 0.0004552708157326002,
"calib/mean_conf": 0.6541338582677164,
"calib/mu_c": 0.654228855721393,
"calib/mu_w": 0.6537735849056604,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.06036501335931002,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5806550618415026,
"calib/step_q_c_n": 2183.0,
"calib/step_q_gap": -0.03269000858103266,
"calib/step_q_w": 0.6133450704225353,
"calib/step_q_w_n": 852.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2493.0,
"completions/max_terminated_length": 2493.0,
"completions/mean_length": 726.25390625,
"completions/mean_terminated_length": 734.8656616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.6560176014900208,
"kl": 0.08819580078125,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0339,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.022718653082847595,
"mask/share_reasoning": 0.7660689353942871,
"mask/share_step_conf": 0.1994936764240265,
"num_tokens": 19171984.0,
"reward": 0.7354456186294556,
"reward_std": 0.21538183093070984,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.8062112927436829,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.3092111051082611,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.7510883808135986,
"adv/mean_abs_reasoning": 0.25280725955963135,
"adv/mean_abs_step_conf": 0.7716500163078308,
"adv/ratio_final_to_reasoning": 2.9709921389201024,
"adv/ratio_step_to_reasoning": 3.052325386747118,
"adv/std_final_conf": 0.9271266460418701,
"adv/std_reasoning": 0.5481659770011902,
"adv/std_step_conf": 0.9363374710083008,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.94921875,
"calib/ece": 0.04152941176470584,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.010342105263158041,
"calib/mean_conf": 0.6436470588235293,
"calib/mu_c": 0.6475,
"calib/mu_w": 0.6371578947368419,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.028862745098039176,
"calib/std_conf": 0.05433766018958576,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5741564506713368,
"calib/step_q_c_n": 1713.0,
"calib/step_q_gap": -0.01048575116352557,
"calib/step_q_w": 0.5846422018348624,
"calib/step_q_w_n": 1090.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1725.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 679.70703125,
"completions/mean_terminated_length": 685.05908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 238.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.21009093523025513,
"kl": 0.0871734619140625,
"learning_rate": 3.777777777777778e-06,
"loss": -0.01,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.024044401943683624,
"mask/share_reasoning": 0.7692995071411133,
"mask/share_step_conf": 0.19884361326694489,
"num_tokens": 19451013.0,
"reward": 0.6475369334220886,
"reward_std": 0.20800790190696716,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.764864444732666,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.20599065721035004,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.7202534079551697,
"adv/mean_abs_reasoning": 0.4160011112689972,
"adv/mean_abs_step_conf": 0.7719686627388,
"adv/ratio_final_to_reasoning": 1.7313737594547793,
"adv/ratio_step_to_reasoning": 1.855688943675982,
"adv/std_final_conf": 0.9306468367576599,
"adv/std_reasoning": 0.7012670636177063,
"adv/std_step_conf": 0.936653196811676,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.515625,
"calib/ece": 0.18390438247011956,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.00398406374501992,
"calib/gap": 0.008322925958965444,
"calib/mean_conf": 0.6666135458167332,
"calib/mu_c": 0.6705263157894737,
"calib/mu_w": 0.6622033898305083,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16031872509960163,
"calib/std_conf": 0.06852520151598002,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6015303119482048,
"calib/step_q_c_n": 1699.0,
"calib/step_q_gap": 0.006436615185002048,
"calib/step_q_w": 0.5950936967632028,
"calib/step_q_w_n": 1761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2558.0,
"completions/max_terminated_length": 2558.0,
"completions/mean_length": 812.390625,
"completions/mean_terminated_length": 825.2857666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0704,
"grad_norm": 0.2491704523563385,
"kl": 0.0764312744140625,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0821,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020575175061821938,
"mask/share_reasoning": 0.7718431353569031,
"mask/share_step_conf": 0.19195665419101715,
"num_tokens": 19765337.0,
"reward": 0.5751186609268188,
"reward_std": 0.24441185593605042,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7152577638626099,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.13576076924800873,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7632363438606262,
"adv/mean_abs_reasoning": 0.31024909019470215,
"adv/mean_abs_step_conf": 0.792525053024292,
"adv/ratio_final_to_reasoning": 2.4600760098334025,
"adv/ratio_step_to_reasoning": 2.5544798617360303,
"adv/std_final_conf": 0.9284982085227966,
"adv/std_reasoning": 0.5960694551467896,
"adv/std_step_conf": 0.9355476498603821,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 12.21484375,
"calib/ece": 0.13881422924901188,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0411336705202312,
"calib/mean_conf": 0.6584980237154151,
"calib/mu_c": 0.6454913294797688,
"calib/mu_w": 0.686625,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0567588932806324,
"calib/std_conf": 0.06758479519499076,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5716897506925208,
"calib/step_q_c_n": 1805.0,
"calib/step_q_gap": -0.03887757154651106,
"calib/step_q_w": 0.6105673222390319,
"calib/step_q_w_n": 1322.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2135.0,
"completions/max_terminated_length": 2135.0,
"completions/mean_length": 775.9296875,
"completions/mean_terminated_length": 785.1304931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.07146666666666666,
"grad_norm": 1.5712337493896484,
"kl": 0.0920257568359375,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0613,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.022031016647815704,
"mask/share_reasoning": 0.7744245529174805,
"mask/share_step_conf": 0.1918257176876068,
"num_tokens": 20068983.0,
"reward": 0.7031233310699463,
"reward_std": 0.20975123345851898,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7481851577758789,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3268115818500519,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.7644180059432983,
"adv/mean_abs_reasoning": 0.3880755305290222,
"adv/mean_abs_step_conf": 0.7951478958129883,
"adv/ratio_final_to_reasoning": 1.9697660527610394,
"adv/ratio_step_to_reasoning": 2.048951385131259,
"adv/std_final_conf": 0.9311574697494507,
"adv/std_reasoning": 0.6612011790275574,
"adv/std_step_conf": 0.9362030029296875,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.8125,
"calib/ece": 0.12948616600790516,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.011857707509881422,
"calib/gap": -0.048499346405228905,
"calib/mean_conf": 0.6574703557312254,
"calib/mu_c": 0.6383006535947712,
"calib/mu_w": 0.6868000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0911067193675889,
"calib/std_conf": 0.06495303951749451,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5713325330132053,
"calib/step_q_c_n": 1666.0,
"calib/step_q_gap": -0.04335767764354803,
"calib/step_q_w": 0.6146902106567533,
"calib/step_q_w_n": 1614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 727.9375,
"completions/mean_terminated_length": 739.4921264648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.2425469011068344,
"kl": 0.08228302001953125,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0256,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022897986695170403,
"mask/share_reasoning": 0.7602324485778809,
"mask/share_step_conf": 0.2012445628643036,
"num_tokens": 20359423.0,
"reward": 0.6331701874732971,
"reward_std": 0.21751436591148376,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.722222626209259,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.22693029046058655,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7844705581665039,
"adv/mean_abs_reasoning": 0.5006694793701172,
"adv/mean_abs_step_conf": 0.7850892543792725,
"adv/ratio_final_to_reasoning": 1.5668431779652945,
"adv/ratio_step_to_reasoning": 1.5680789157888722,
"adv/std_final_conf": 0.9311051368713379,
"adv/std_reasoning": 0.7394245862960815,
"adv/std_step_conf": 0.9362419247627258,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.64453125,
"calib/ece": 0.11515999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.027525703463203488,
"calib/mean_conf": 0.66044,
"calib/mu_c": 0.6498701298701298,
"calib/mu_w": 0.6773958333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07979999999999998,
"calib/std_conf": 0.06698810640703319,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5879117496151873,
"calib/step_q_c_n": 1949.0,
"calib/step_q_gap": -0.02638876851952776,
"calib/step_q_w": 0.6143005181347151,
"calib/step_q_w_n": 1544.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2495.0,
"completions/max_terminated_length": 2495.0,
"completions/mean_length": 813.2265625,
"completions/mean_terminated_length": 829.4263305664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.0736,
"grad_norm": 1.063038945198059,
"kl": 0.075531005859375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.1206,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019905662164092064,
"mask/share_reasoning": 0.7785724401473999,
"mask/share_step_conf": 0.1819905936717987,
"num_tokens": 20672105.0,
"reward": 0.6814106702804565,
"reward_std": 0.23180070519447327,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7257925868034363,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.3221849501132965,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7228235006332397,
"adv/mean_abs_reasoning": 0.33797919750213623,
"adv/mean_abs_step_conf": 0.7491491436958313,
"adv/ratio_final_to_reasoning": 2.1386626927791053,
"adv/ratio_step_to_reasoning": 2.2165540046028904,
"adv/std_final_conf": 0.9298471212387085,
"adv/std_reasoning": 0.661091148853302,
"adv/std_step_conf": 0.9359894394874573,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 12.90625,
"calib/ece": 0.08349397590361451,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.004016064257028112,
"calib/gap": -0.03341465062846338,
"calib/mean_conf": 0.6573895582329318,
"calib/mu_c": 0.6442384105960265,
"calib/mu_w": 0.6776530612244899,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06722891566265059,
"calib/std_conf": 0.06152686463401489,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5746817370612731,
"calib/step_q_c_n": 1681.0,
"calib/step_q_gap": -0.0379861618912839,
"calib/step_q_w": 0.612667898952557,
"calib/step_q_w_n": 1623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2506.0,
"completions/max_terminated_length": 2506.0,
"completions/mean_length": 781.09765625,
"completions/mean_terminated_length": 793.49609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.07466666666666667,
"grad_norm": 3.1837499141693115,
"kl": 0.10294342041015625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.1147,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.021590005606412888,
"mask/share_reasoning": 0.7688941955566406,
"mask/share_step_conf": 0.19389083981513977,
"num_tokens": 20979058.0,
"reward": 0.6882272362709045,
"reward_std": 0.1804431676864624,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.718786358833313,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.34516817331314087,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7644456624984741,
"adv/mean_abs_reasoning": 0.5488430261611938,
"adv/mean_abs_step_conf": 0.7760787010192871,
"adv/ratio_final_to_reasoning": 1.3928311485440252,
"adv/ratio_step_to_reasoning": 1.4140267144277328,
"adv/std_final_conf": 0.9317163228988647,
"adv/std_reasoning": 0.7927515506744385,
"adv/std_step_conf": 0.9363721013069153,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.6953125,
"calib/ece": 0.06669322709163347,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.01195219123505976,
"calib/gap": -0.014381833473507077,
"calib/mean_conf": 0.6547410358565737,
"calib/mu_c": 0.6497560975609756,
"calib/mu_w": 0.6641379310344827,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03402390438247012,
"calib/std_conf": 0.07459473955077608,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5889476284584981,
"calib/step_q_c_n": 2024.0,
"calib/step_q_gap": -0.013124149681795538,
"calib/step_q_w": 0.6020717781402937,
"calib/step_q_w_n": 1226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 742.265625,
"completions/mean_terminated_length": 757.0518188476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 271.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.25131160020828247,
"kl": 0.07721710205078125,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0556,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.023128561675548553,
"mask/share_reasoning": 0.7649872899055481,
"mask/share_step_conf": 0.19235289096832275,
"num_tokens": 21273486.0,
"reward": 0.6729413866996765,
"reward_std": 0.25942322611808777,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7465749979019165,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.2750890254974365,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7536088228225708,
"adv/mean_abs_reasoning": 0.30147671699523926,
"adv/mean_abs_step_conf": 0.7713176608085632,
"adv/ratio_final_to_reasoning": 2.4997247891434062,
"adv/ratio_step_to_reasoning": 2.5584651063476436,
"adv/std_final_conf": 0.9291418790817261,
"adv/std_reasoning": 0.5959498882293701,
"adv/std_step_conf": 0.9357434511184692,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.75390625,
"calib/ece": 0.08629921259842521,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.016833333333333256,
"calib/mean_conf": 0.6575590551181102,
"calib/mu_c": 0.6506666666666667,
"calib/mu_w": 0.6675,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07665354330708662,
"calib/std_conf": 0.051979955789050146,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5783435582822085,
"calib/step_q_c_n": 1793.0,
"calib/step_q_gap": -0.014781441717791388,
"calib/step_q_w": 0.5931249999999999,
"calib/step_q_w_n": 1472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1789.0,
"completions/max_terminated_length": 1789.0,
"completions/mean_length": 698.6796875,
"completions/mean_terminated_length": 704.1810913085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.0768,
"grad_norm": 0.2750079035758972,
"kl": 0.0861053466796875,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0094,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02209099940955639,
"mask/share_reasoning": 0.7602138519287109,
"mask/share_step_conf": 0.20988258719444275,
"num_tokens": 21556756.0,
"reward": 0.6741582155227661,
"reward_std": 0.19570280611515045,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7370632886886597,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.295628160238266,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7786300182342529,
"adv/mean_abs_reasoning": 0.35900551080703735,
"adv/mean_abs_step_conf": 0.7729635238647461,
"adv/ratio_final_to_reasoning": 2.1688525518282655,
"adv/ratio_step_to_reasoning": 2.153068687238642,
"adv/std_final_conf": 0.9293910264968872,
"adv/std_reasoning": 0.6186598539352417,
"adv/std_step_conf": 0.9356748461723328,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.515625,
"calib/ece": 0.14157480314960633,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.002855339105339083,
"calib/mean_conf": 0.6379527559055118,
"calib/mu_c": 0.6373232323232323,
"calib/mu_w": 0.6401785714285714,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.04520885187306327,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5724254742547426,
"calib/step_q_c_n": 2214.0,
"calib/step_q_gap": -0.01903229141283236,
"calib/step_q_w": 0.5914577656675749,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2374.0,
"completions/max_terminated_length": 2374.0,
"completions/mean_length": 683.625,
"completions/mean_terminated_length": 689.0078735351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 308.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.20970845222473145,
"kl": 0.08272552490234375,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0009,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02301781065762043,
"mask/share_reasoning": 0.7797709703445435,
"mask/share_step_conf": 0.18939867615699768,
"num_tokens": 21838796.0,
"reward": 0.7931956648826599,
"reward_std": 0.18731746077537537,
"rewards/accuracy_reward_step": 0.7734375,
"rewards/final_brier_reward_step": 0.7987773418426514,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.43448901176452637,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7864284515380859,
"adv/mean_abs_reasoning": 0.38745418190956116,
"adv/mean_abs_step_conf": 0.7657153606414795,
"adv/ratio_final_to_reasoning": 2.029732774239749,
"adv/ratio_step_to_reasoning": 1.9762733153831629,
"adv/std_final_conf": 0.9295899271965027,
"adv/std_reasoning": 0.6403974890708923,
"adv/std_step_conf": 0.9360736012458801,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.75,
"calib/ece": 0.16272000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03280922431865829,
"calib/mean_conf": 0.6427999999999999,
"calib/mu_c": 0.6288888888888888,
"calib/mu_w": 0.6616981132075471,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11475999999999999,
"calib/std_conf": 0.05854365892220951,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5696430931923331,
"calib/step_q_c_n": 1513.0,
"calib/step_q_gap": -0.044154736619203216,
"calib/step_q_w": 0.6137978298115363,
"calib/step_q_w_n": 1751.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2922.0,
"completions/max_terminated_length": 2922.0,
"completions/mean_length": 679.0703125,
"completions/mean_terminated_length": 695.3680419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.24181662499904633,
"kl": 0.09625244140625,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0788,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02417697384953499,
"mask/share_reasoning": 0.7582485675811768,
"mask/share_step_conf": 0.19413697719573975,
"num_tokens": 22116566.0,
"reward": 0.6208184361457825,
"reward_std": 0.21420027315616608,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7147077918052673,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.21911655366420746,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7937130928039551,
"adv/mean_abs_reasoning": 0.3079615533351898,
"adv/mean_abs_step_conf": 0.7679805755615234,
"adv/ratio_final_to_reasoning": 2.5773122787832747,
"adv/ratio_step_to_reasoning": 2.4937547146531056,
"adv/std_final_conf": 0.926730215549469,
"adv/std_reasoning": 0.5726933479309082,
"adv/std_step_conf": 0.9360403418540955,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.41796875,
"calib/ece": 0.20246093749999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02472130546387974,
"calib/mean_conf": 0.6330859375,
"calib/mu_c": 0.6278712871287129,
"calib/mu_w": 0.6525925925925926,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.023242187499999983,
"calib/std_conf": 0.04643805136680581,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5746615523465706,
"calib/step_q_c_n": 2216.0,
"calib/step_q_gap": -0.007813695178181868,
"calib/step_q_w": 0.5824752475247524,
"calib/step_q_w_n": 707.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1678.0,
"completions/max_terminated_length": 1678.0,
"completions/mean_length": 643.375,
"completions/mean_terminated_length": 648.44091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.08,
"grad_norm": 0.1915205866098404,
"kl": 0.0908966064453125,
"learning_rate": 3.5e-06,
"loss": 0.0015,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.024786923080682755,
"mask/share_reasoning": 0.7751480340957642,
"mask/share_step_conf": 0.192252516746521,
"num_tokens": 22386022.0,
"reward": 0.7966657876968384,
"reward_std": 0.18921825289726257,
"rewards/accuracy_reward_step": 0.7890625,
"rewards/final_brier_reward_step": 0.7988425493240356,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.4366765022277832,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.751955509185791,
"adv/mean_abs_reasoning": 0.3741779029369354,
"adv/mean_abs_step_conf": 0.7689213156700134,
"adv/ratio_final_to_reasoning": 2.0096202990173015,
"adv/ratio_step_to_reasoning": 2.0549618500577482,
"adv/std_final_conf": 0.9292881488800049,
"adv/std_reasoning": 0.6612319350242615,
"adv/std_step_conf": 0.9360610842704773,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 12.3515625,
"calib/ece": 0.06920634920634912,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003968253968253968,
"calib/gap": -0.019063157894736893,
"calib/mean_conf": 0.6423015873015874,
"calib/mu_c": 0.6347368421052632,
"calib/mu_w": 0.6538,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05416666666666666,
"calib/std_conf": 0.05576541993899569,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5709497882637629,
"calib/step_q_c_n": 1653.0,
"calib/step_q_gap": -0.03404027137838428,
"calib/step_q_w": 0.6049900596421471,
"calib/step_q_w_n": 1509.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1663.0,
"completions/max_terminated_length": 1663.0,
"completions/mean_length": 668.32421875,
"completions/mean_terminated_length": 681.637451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.19638362526893616,
"kl": 0.0817718505859375,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0522,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.024196840822696686,
"mask/share_reasoning": 0.7593661546707153,
"mask/share_step_conf": 0.196905717253685,
"num_tokens": 22660169.0,
"reward": 0.6167426109313965,
"reward_std": 0.22435541450977325,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7352085709571838,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.1826515942811966,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7270142436027527,
"adv/mean_abs_reasoning": 0.47899097204208374,
"adv/mean_abs_step_conf": 0.7548226118087769,
"adv/ratio_final_to_reasoning": 1.5178036456580184,
"adv/ratio_step_to_reasoning": 1.5758597883186383,
"adv/std_final_conf": 0.9317358136177063,
"adv/std_reasoning": 0.7575597763061523,
"adv/std_step_conf": 0.9361504316329956,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.13671875,
"calib/ece": 0.09701195219123503,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.00398406374501992,
"calib/gap": -0.018355166323226202,
"calib/mean_conf": 0.6479282868525896,
"calib/mu_c": 0.6421511627906977,
"calib/mu_w": 0.6605063291139239,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02984063745019921,
"calib/std_conf": 0.06739083571007691,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5808516746411483,
"calib/step_q_c_n": 2090.0,
"calib/step_q_gap": -0.040012425908733884,
"calib/step_q_w": 0.6208641005498822,
"calib/step_q_w_n": 1273.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 678.09375,
"completions/mean_terminated_length": 691.6016235351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.7611878514289856,
"kl": 0.1143951416015625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0598,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.025160975754261017,
"mask/share_reasoning": 0.7508993744850159,
"mask/share_step_conf": 0.2044084072113037,
"num_tokens": 22938425.0,
"reward": 0.7108245491981506,
"reward_std": 0.25719261169433594,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.755419909954071,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3357604146003723,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7609703540802002,
"adv/mean_abs_reasoning": 0.4544995129108429,
"adv/mean_abs_step_conf": 0.7707890272140503,
"adv/ratio_final_to_reasoning": 1.6743040035545127,
"adv/ratio_step_to_reasoning": 1.6959072679253948,
"adv/std_final_conf": 0.9323300719261169,
"adv/std_reasoning": 0.7205326557159424,
"adv/std_step_conf": 0.9358113408088684,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.26953125,
"calib/ece": 0.0807874015748031,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.015016769865841106,
"calib/mean_conf": 0.6534645669291338,
"calib/mu_c": 0.6474342105263158,
"calib/mu_w": 0.6624509803921569,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0679133858267716,
"calib/std_conf": 0.053893712147748914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5778175026680896,
"calib/step_q_c_n": 1874.0,
"calib/step_q_gap": -0.020140475007550562,
"calib/step_q_w": 0.5979579776756402,
"calib/step_q_w_n": 1523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2000.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 776.51171875,
"completions/mean_terminated_length": 782.6259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.0832,
"grad_norm": 0.24049845337867737,
"kl": 0.0765838623046875,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0022,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020861711353063583,
"mask/share_reasoning": 0.7841784358024597,
"mask/share_step_conf": 0.1871473640203476,
"num_tokens": 23245236.0,
"reward": 0.6060810089111328,
"reward_std": 0.21829815208911896,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7407039403915405,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.15427061915397644,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.7370405197143555,
"adv/mean_abs_reasoning": 0.42967432737350464,
"adv/mean_abs_step_conf": 0.7801761627197266,
"adv/ratio_final_to_reasoning": 1.7153468866052717,
"adv/ratio_step_to_reasoning": 1.8157383697759066,
"adv/std_final_conf": 0.9302157163619995,
"adv/std_reasoning": 0.7205436825752258,
"adv/std_step_conf": 0.9363252520561218,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.58203125,
"calib/ece": 0.029561752988047797,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.01458523119392685,
"calib/mean_conf": 0.6455776892430278,
"calib/mu_c": 0.6508074534161491,
"calib/mu_w": 0.6362222222222222,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.016852589641434237,
"calib/std_conf": 0.05186525735554406,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5753923205342237,
"calib/step_q_c_n": 1797.0,
"calib/step_q_gap": -0.05129307272420325,
"calib/step_q_w": 0.626685393258427,
"calib/step_q_w_n": 1424.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2973.0,
"completions/max_terminated_length": 2973.0,
"completions/mean_length": 723.109375,
"completions/mean_terminated_length": 737.5139770507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.21749094128608704,
"kl": 0.08119964599609375,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0451,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.021755652502179146,
"mask/share_reasoning": 0.7732414603233337,
"mask/share_step_conf": 0.18547168374061584,
"num_tokens": 23536728.0,
"reward": 0.639240026473999,
"reward_std": 0.2535552978515625,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7588882446289062,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.1977168172597885,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.7694152593612671,
"adv/mean_abs_reasoning": 0.3173452615737915,
"adv/mean_abs_step_conf": 0.7661705017089844,
"adv/ratio_final_to_reasoning": 2.4245367822590187,
"adv/ratio_step_to_reasoning": 2.414312089959562,
"adv/std_final_conf": 0.9289205074310303,
"adv/std_reasoning": 0.5960049033164978,
"adv/std_step_conf": 0.9357802271842957,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.53125,
"calib/ece": 0.10195312500000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0006266706266706557,
"calib/mean_conf": 0.638203125,
"calib/mu_c": 0.638021978021978,
"calib/mu_w": 0.6386486486486487,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.014609374999999994,
"calib/std_conf": 0.04810362502176292,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5719396551724139,
"calib/step_q_c_n": 2088.0,
"calib/step_q_gap": -0.0060117337164750095,
"calib/step_q_w": 0.5779513888888889,
"calib/step_q_w_n": 864.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1680.0,
"completions/max_terminated_length": 1680.0,
"completions/mean_length": 625.4609375,
"completions/mean_terminated_length": 630.3858032226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.32449737191200256,
"kl": 0.0968017578125,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0326,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02532077580690384,
"mask/share_reasoning": 0.7636048197746277,
"mask/share_step_conf": 0.20326188206672668,
"num_tokens": 23799006.0,
"reward": 0.6887508630752563,
"reward_std": 0.20290717482566833,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.786632776260376,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.2486814260482788,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.7590487003326416,
"adv/mean_abs_reasoning": 0.3880404531955719,
"adv/mean_abs_step_conf": 0.7771025896072388,
"adv/ratio_final_to_reasoning": 1.9561071380103816,
"adv/ratio_step_to_reasoning": 2.002632929653806,
"adv/std_final_conf": 0.9300270676612854,
"adv/std_reasoning": 0.6612900495529175,
"adv/std_step_conf": 0.9357220530509949,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.39453125,
"calib/ece": 0.13869565217391305,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.023583829365079412,
"calib/mean_conf": 0.6466007905138341,
"calib/mu_c": 0.6406349206349206,
"calib/mu_w": 0.66421875,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.019130434782608677,
"calib/std_conf": 0.049681860594208434,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5843677204658903,
"calib/step_q_c_n": 2404.0,
"calib/step_q_gap": -0.015905450265817023,
"calib/step_q_w": 0.6002731707317073,
"calib/step_q_w_n": 1025.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2944.0,
"completions/max_terminated_length": 2944.0,
"completions/mean_length": 733.7890625,
"completions/mean_terminated_length": 739.5669555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.0864,
"grad_norm": 0.23874109983444214,
"kl": 0.078399658203125,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0096,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02268517017364502,
"mask/share_reasoning": 0.7729635834693909,
"mask/share_step_conf": 0.19653871655464172,
"num_tokens": 24093104.0,
"reward": 0.7508682012557983,
"reward_std": 0.2056404948234558,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7783128619194031,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3788922429084778,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.7448205947875977,
"adv/mean_abs_reasoning": 0.41496533155441284,
"adv/mean_abs_step_conf": 0.7517217397689819,
"adv/ratio_final_to_reasoning": 1.7948983641539031,
"adv/ratio_step_to_reasoning": 1.8115290184679236,
"adv/std_final_conf": 0.9288557171821594,
"adv/std_reasoning": 0.7013262510299683,
"adv/std_step_conf": 0.9361816644668579,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.83984375,
"calib/ece": 0.05484000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.016202805499336947,
"calib/mean_conf": 0.63788,
"calib/mu_c": 0.6321118012422361,
"calib/mu_w": 0.648314606741573,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02436000000000001,
"calib/std_conf": 0.041932154726414914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5694038668098819,
"calib/step_q_c_n": 1862.0,
"calib/step_q_gap": -0.03523472968134622,
"calib/step_q_w": 0.6046385964912281,
"calib/step_q_w_n": 1425.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 647.83203125,
"completions/mean_terminated_length": 660.737060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 336.0,
"epoch": 0.08746666666666666,
"grad_norm": 8.346512794494629,
"kl": 0.209014892578125,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.0679,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02369321510195732,
"mask/share_reasoning": 0.7597755193710327,
"mask/share_step_conf": 0.19700007140636444,
"num_tokens": 24364501.0,
"reward": 0.6778282523155212,
"reward_std": 0.2450428605079651,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7436628937721252,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.2908998727798462,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.7520098686218262,
"adv/mean_abs_reasoning": 0.2951866388320923,
"adv/mean_abs_step_conf": 0.7480201721191406,
"adv/ratio_final_to_reasoning": 2.547574211343568,
"adv/ratio_step_to_reasoning": 2.534058367542267,
"adv/std_final_conf": 0.9304251074790955,
"adv/std_reasoning": 0.5961121320724487,
"adv/std_step_conf": 0.9359679222106934,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 14.26171875,
"calib/ece": 0.13076612903225818,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.029567307692307865,
"calib/mean_conf": 0.6555241935483871,
"calib/mu_c": 0.643125,
"calib/mu_w": 0.6726923076923078,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10282258064516131,
"calib/std_conf": 0.05380414957460447,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5741127232142856,
"calib/step_q_c_n": 1792.0,
"calib/step_q_gap": -0.04678022998636,
"calib/step_q_w": 0.6208929532006456,
"calib/step_q_w_n": 1859.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 789.95703125,
"completions/mean_terminated_length": 812.1646118164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.1789998710155487,
"kl": 0.07549285888671875,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0828,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02037701942026615,
"mask/share_reasoning": 0.7650948166847229,
"mask/share_step_conf": 0.1871844232082367,
"num_tokens": 24673994.0,
"reward": 0.6012614965438843,
"reward_std": 0.20678496360778809,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7106777429580688,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.18559530377388,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.7646782398223877,
"adv/mean_abs_reasoning": 0.36597129702568054,
"adv/mean_abs_step_conf": 0.7656229734420776,
"adv/ratio_final_to_reasoning": 2.0894486699833443,
"adv/ratio_step_to_reasoning": 2.0920301118269204,
"adv/std_final_conf": 0.9289400577545166,
"adv/std_reasoning": 0.6403570771217346,
"adv/std_step_conf": 0.9360085129737854,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.29296875,
"calib/ece": 0.05787148594377517,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.009090157154673206,
"calib/mean_conf": 0.6423694779116466,
"calib/mu_c": 0.6389743589743591,
"calib/mu_w": 0.6480645161290323,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03686746987951806,
"calib/std_conf": 0.04789135998877891,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5734943639291465,
"calib/step_q_c_n": 1863.0,
"calib/step_q_gap": -0.044200441265658696,
"calib/step_q_w": 0.6176948051948052,
"calib/step_q_w_n": 1540.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2975.0,
"completions/max_terminated_length": 2975.0,
"completions/mean_length": 675.76171875,
"completions/mean_terminated_length": 691.9800415039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0896,
"grad_norm": 0.24721167981624603,
"kl": 0.08000946044921875,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02393614500761032,
"mask/share_reasoning": 0.7570117712020874,
"mask/share_step_conf": 0.19561460614204407,
"num_tokens": 24952909.0,
"reward": 0.6713294386863708,
"reward_std": 0.21048003435134888,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7384449243545532,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.2878076434135437,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.7614791393280029,
"adv/mean_abs_reasoning": 0.4133550226688385,
"adv/mean_abs_step_conf": 0.7623780369758606,
"adv/ratio_final_to_reasoning": 1.8421915727828615,
"adv/ratio_step_to_reasoning": 1.8443662110443102,
"adv/std_final_conf": 0.9300859570503235,
"adv/std_reasoning": 0.681785523891449,
"adv/std_step_conf": 0.9362416863441467,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 16.1171875,
"calib/ece": 0.08950413223140495,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.015131104967747167,
"calib/mean_conf": 0.6545454545454544,
"calib/mu_c": 0.6502312138728324,
"calib/mu_w": 0.6653623188405796,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.014586776859504125,
"calib/std_conf": 0.05056703267335774,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5807871591908531,
"calib/step_q_c_n": 2274.0,
"calib/step_q_gap": -0.08242558378970843,
"calib/step_q_w": 0.6632127429805615,
"calib/step_q_w_n": 1852.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 736.36328125,
"completions/mean_terminated_length": 775.7572021484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.2139563411474228,
"kl": 0.065704345703125,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.1757,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.020088504999876022,
"mask/share_reasoning": 0.7408749461174011,
"mask/share_step_conf": 0.18825532495975494,
"num_tokens": 25249242.0,
"reward": 0.7280115485191345,
"reward_std": 0.23207849264144897,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7409422397613525,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.3908621668815613,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7666450142860413,
"adv/mean_abs_reasoning": 0.47989705204963684,
"adv/mean_abs_step_conf": 0.7776232957839966,
"adv/ratio_final_to_reasoning": 1.5975197409771658,
"adv/ratio_step_to_reasoning": 1.6203960671622655,
"adv/std_final_conf": 0.9320396184921265,
"adv/std_reasoning": 0.7395176887512207,
"adv/std_step_conf": 0.9357236623764038,
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 17.3359375,
"calib/ece": 0.13842553191489365,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.022039048737521938,
"calib/mean_conf": 0.6698297872340425,
"calib/mu_c": 0.6600763358778626,
"calib/mu_w": 0.6821153846153846,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12540425531914895,
"calib/std_conf": 0.06703423042663278,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5864088397790056,
"calib/step_q_c_n": 1810.0,
"calib/step_q_gap": -0.0714184052742668,
"calib/step_q_w": 0.6578272450532724,
"calib/step_q_w_n": 2628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2532.0,
"completions/max_terminated_length": 2532.0,
"completions/mean_length": 797.66015625,
"completions/mean_terminated_length": 861.6075439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.2806868255138397,
"kl": 0.06467437744140625,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.2478,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.019146300852298737,
"mask/share_reasoning": 0.7248205542564392,
"mask/share_step_conf": 0.18181441724300385,
"num_tokens": 25558955.0,
"reward": 0.5457245111465454,
"reward_std": 0.25168851017951965,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6658051013946533,
"rewards/format_reward_step": 0.91796875,
"rewards/step_correlation_reward": 0.13970647752285004,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.7346113920211792,
"adv/mean_abs_reasoning": 0.46212661266326904,
"adv/mean_abs_step_conf": 0.7368422746658325,
"adv/ratio_final_to_reasoning": 1.5896323039860456,
"adv/ratio_step_to_reasoning": 1.5944597313263507,
"adv/std_final_conf": 0.9318578243255615,
"adv/std_reasoning": 0.7396441102027893,
"adv/std_step_conf": 0.9363143444061279,
"calib/answer_extract_rate": 0.890625,
"calib/avg_num_step_conf": 17.08203125,
"calib/ece": 0.14903508771929824,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.009928332537028073,
"calib/mean_conf": 0.6514035087719299,
"calib/mu_c": 0.6534065934065934,
"calib/mu_w": 0.6434782608695653,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0010964912280701754,
"calib/std_conf": 0.053702840478021444,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5824095394736842,
"calib/step_q_c_n": 2432.0,
"calib/step_q_gap": -0.10879087268499688,
"calib/step_q_w": 0.6912004121586811,
"calib/step_q_w_n": 1941.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3016.0,
"completions/max_terminated_length": 3016.0,
"completions/mean_length": 658.2109375,
"completions/mean_terminated_length": 735.8165893554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.0928,
"grad_norm": 0.5631839036941528,
"kl": 0.074859619140625,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.3261,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.020042482763528824,
"mask/share_reasoning": 0.6945275068283081,
"mask/share_step_conf": 0.17996126413345337,
"num_tokens": 25832953.0,
"reward": 0.7213019132614136,
"reward_std": 0.2892686724662781,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.728265643119812,
"rewards/format_reward_step": 0.890625,
"rewards/step_correlation_reward": 0.3940257430076599,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.7458748817443848,
"adv/mean_abs_reasoning": 0.4192622900009155,
"adv/mean_abs_step_conf": 0.7751203775405884,
"adv/ratio_final_to_reasoning": 1.7790173348114757,
"adv/ratio_step_to_reasoning": 1.8487719883867824,
"adv/std_final_conf": 0.9322258830070496,
"adv/std_reasoning": 0.7017030715942383,
"adv/std_step_conf": 0.9357836842536926,
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 17.6484375,
"calib/ece": 0.0789029535864979,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0019180790960452443,
"calib/mean_conf": 0.6679324894514769,
"calib/mu_c": 0.6684180790960452,
"calib/mu_w": 0.6665,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05338685025245993,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5890498625834315,
"calib/step_q_c_n": 2547.0,
"calib/step_q_gap": -0.08474567267785726,
"calib/step_q_w": 0.6737955352612888,
"calib/step_q_w_n": 1971.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2984.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 778.875,
"completions/mean_terminated_length": 837.7815551757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.2500193119049072,
"kl": 0.06778717041015625,
"learning_rate": 3.138888888888889e-06,
"loss": -0.2581,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017798837274312973,
"mask/share_reasoning": 0.7285441160202026,
"mask/share_step_conf": 0.1833444982767105,
"num_tokens": 26142193.0,
"reward": 0.784960150718689,
"reward_std": 0.2499820590019226,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.743010938167572,
"rewards/format_reward_step": 0.92578125,
"rewards/step_correlation_reward": 0.5034719109535217,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.7492777109146118,
"adv/mean_abs_reasoning": 0.6360772848129272,
"adv/mean_abs_step_conf": 0.7455618381500244,
"adv/ratio_final_to_reasoning": 1.177966465403614,
"adv/ratio_step_to_reasoning": 1.1721246080486853,
"adv/std_final_conf": 0.9346681833267212,
"adv/std_reasoning": 0.859366774559021,
"adv/std_step_conf": 0.9364317655563354,
"calib/answer_extract_rate": 0.796875,
"calib/avg_num_step_conf": 22.46875,
"calib/ece": 0.12450980392156871,
"calib/final_conf_rate": 0.796875,
"calib/format_rate": 0.796875,
"calib/frac_conf_gt_0.9": 0.004901960784313725,
"calib/gap": -0.03393914043710122,
"calib/mean_conf": 0.6731372549019607,
"calib/mu_c": 0.6609923664122138,
"calib/mu_w": 0.694931506849315,
"calib/nonempty_final_conf_rate": 0.796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07774509803921568,
"calib/std_conf": 0.060811189504358695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.58310119695321,
"calib/step_q_c_n": 1838.0,
"calib/step_q_gap": -0.11488296247448548,
"calib/step_q_w": 0.6979841594276955,
"calib/step_q_w_n": 3914.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19921875,
"completions/max_length": 2682.0,
"completions/max_terminated_length": 2682.0,
"completions/mean_length": 679.6015625,
"completions/mean_terminated_length": 848.6731567382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.9176256656646729,
"kl": 0.10498046875,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.6332,
"mask/has_final_conf_rate": 0.796875,
"mask/share_final_conf": 0.015897078439593315,
"mask/share_reasoning": 0.6220657825469971,
"mask/share_step_conf": 0.16281834244728088,
"num_tokens": 26425059.0,
"reward": 0.5518848896026611,
"reward_std": 0.3001466393470764,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5976187586784363,
"rewards/format_reward_step": 0.796875,
"rewards/step_correlation_reward": 0.24443230032920837,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7582546472549438,
"adv/mean_abs_reasoning": 0.5267269015312195,
"adv/mean_abs_step_conf": 0.748308539390564,
"adv/ratio_final_to_reasoning": 1.4395593713756836,
"adv/ratio_step_to_reasoning": 1.4206765160754016,
"adv/std_final_conf": 0.9336482882499695,
"adv/std_reasoning": 0.7757920622825623,
"adv/std_step_conf": 0.9350053071975708,
"calib/answer_extract_rate": 0.84375,
"calib/avg_num_step_conf": 22.69921875,
"calib/ece": 0.10333333333333322,
"calib/final_conf_rate": 0.84375,
"calib/format_rate": 0.84375,
"calib/frac_conf_gt_0.9": 0.004629629629629629,
"calib/gap": -0.005089285714285685,
"calib/mean_conf": 0.6758333333333333,
"calib/mu_c": 0.6747023809523809,
"calib/mu_w": 0.6797916666666666,
"calib/nonempty_final_conf_rate": 0.84375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0006944444444444461,
"calib/std_conf": 0.06474486966662414,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5961133768352366,
"calib/step_q_c_n": 2452.0,
"calib/step_q_gap": -0.12455051122668659,
"calib/step_q_w": 0.7206638880619232,
"calib/step_q_w_n": 3359.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 714.82421875,
"completions/mean_terminated_length": 843.294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.096,
"grad_norm": 0.6938077807426453,
"kl": 0.066070556640625,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.5306,
"mask/has_final_conf_rate": 0.84375,
"mask/share_final_conf": 0.01659608632326126,
"mask/share_reasoning": 0.6598981618881226,
"mask/share_step_conf": 0.1711619645357132,
"num_tokens": 26711374.0,
"reward": 0.7062116265296936,
"reward_std": 0.26097801327705383,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6841264963150024,
"rewards/format_reward_step": 0.84375,
"rewards/step_correlation_reward": 0.4282967746257782,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7301865816116333,
"adv/mean_abs_reasoning": 0.39002954959869385,
"adv/mean_abs_step_conf": 0.7586683630943298,
"adv/ratio_final_to_reasoning": 1.8721314381511123,
"adv/ratio_step_to_reasoning": 1.9451561141327187,
"adv/std_final_conf": 0.9313138127326965,
"adv/std_reasoning": 0.68172287940979,
"adv/std_step_conf": 0.9356062412261963,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 15.78125,
"calib/ece": 0.08085020242914978,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0170270486555697,
"calib/mean_conf": 0.6633603238866397,
"calib/mu_c": 0.6584659090909091,
"calib/mu_w": 0.6754929577464788,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01582995951417004,
"calib/std_conf": 0.051766797667866085,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5833746385790995,
"calib/step_q_c_n": 2421.0,
"calib/step_q_gap": -0.05881189631898576,
"calib/step_q_w": 0.6421865348980853,
"calib/step_q_w_n": 1619.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2890.0,
"completions/max_terminated_length": 2890.0,
"completions/mean_length": 859.53125,
"completions/mean_terminated_length": 883.6947631835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.14019861817359924,
"kl": 0.05571746826171875,
"learning_rate": 3.055555555555556e-06,
"loss": -0.1903,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017590008676052094,
"mask/share_reasoning": 0.7758537530899048,
"mask/share_step_conf": 0.1792125105857849,
"num_tokens": 27039126.0,
"reward": 0.7212048768997192,
"reward_std": 0.21607019007205963,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7555722594261169,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.3563687205314636,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.7461353540420532,
"adv/mean_abs_reasoning": 0.33928218483924866,
"adv/mean_abs_step_conf": 0.7547565698623657,
"adv/ratio_final_to_reasoning": 2.199158657256263,
"adv/ratio_step_to_reasoning": 2.2245688208473666,
"adv/std_final_conf": 0.9299798011779785,
"adv/std_reasoning": 0.6403141021728516,
"adv/std_step_conf": 0.9356287717819214,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.00390625,
"calib/ece": 0.1760956175298805,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.00398406374501992,
"calib/gap": -0.027140638481449653,
"calib/mean_conf": 0.6447011952191236,
"calib/mu_c": 0.6381052631578947,
"calib/mu_w": 0.6652459016393444,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03191235059760955,
"calib/std_conf": 0.056542216701220574,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.576131993006993,
"calib/step_q_c_n": 2288.0,
"calib/step_q_gap": -0.04979500026870343,
"calib/step_q_w": 0.6259269932756965,
"calib/step_q_w_n": 1041.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 756.109375,
"completions/mean_terminated_length": 765.0751342773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 288.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.1883665770292282,
"kl": 0.0677642822265625,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0579,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.021719403564929962,
"mask/share_reasoning": 0.7846254110336304,
"mask/share_step_conf": 0.18193641304969788,
"num_tokens": 27339410.0,
"reward": 0.7869164943695068,
"reward_std": 0.21456190943717957,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7748124599456787,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.45448917150497437,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.755165696144104,
"adv/mean_abs_reasoning": 0.4527568519115448,
"adv/mean_abs_step_conf": 0.7692996859550476,
"adv/ratio_final_to_reasoning": 1.6679277032601174,
"adv/ratio_step_to_reasoning": 1.6991453198489548,
"adv/std_final_conf": 0.9280458092689514,
"adv/std_reasoning": 0.7207328677177429,
"adv/std_step_conf": 0.935437023639679,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 13.69921875,
"calib/ece": 0.1453658536585366,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.026574675324675168,
"calib/mean_conf": 0.648130081300813,
"calib/mu_c": 0.6405681818181818,
"calib/mu_w": 0.6671428571428569,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03902439024390242,
"calib/std_conf": 0.05590860127691766,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5751223091976517,
"calib/step_q_c_n": 2044.0,
"calib/step_q_gap": -0.05254002846468597,
"calib/step_q_w": 0.6276623376623377,
"calib/step_q_w_n": 1463.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2094.0,
"completions/max_terminated_length": 2094.0,
"completions/mean_length": 738.7734375,
"completions/mean_terminated_length": 768.8048706054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.0992,
"grad_norm": 0.18260808289051056,
"kl": 0.063751220703125,
"learning_rate": 3e-06,
"loss": -0.1469,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02047555334866047,
"mask/share_reasoning": 0.7621718645095825,
"mask/share_step_conf": 0.17829002439975739,
"num_tokens": 27634312.0,
"reward": 0.7290872931480408,
"reward_std": 0.24152934551239014,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7475515604019165,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.38093554973602295,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.7560976147651672,
"adv/mean_abs_reasoning": 0.2771226167678833,
"adv/mean_abs_step_conf": 0.7483853101730347,
"adv/ratio_final_to_reasoning": 2.7283865300625076,
"adv/ratio_step_to_reasoning": 2.700556594411343,
"adv/std_final_conf": 0.9292981028556824,
"adv/std_reasoning": 0.5726613402366638,
"adv/std_step_conf": 0.9354825615882874,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.74609375,
"calib/ece": 0.033992094861660126,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.008102249488752489,
"calib/mean_conf": 0.6394466403162056,
"calib/mu_c": 0.6365644171779141,
"calib/mu_w": 0.6446666666666666,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.014584980237154203,
"calib/std_conf": 0.04471793592128724,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5735790543975597,
"calib/step_q_c_n": 1967.0,
"calib/step_q_gap": -0.01689934066416854,
"calib/step_q_w": 0.5904783950617283,
"calib/step_q_w_n": 1296.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 730.92578125,
"completions/mean_terminated_length": 739.5928955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 263.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.14898745715618134,
"kl": 0.06661224365234375,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0483,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021695515140891075,
"mask/share_reasoning": 0.7875087261199951,
"mask/share_step_conf": 0.17907699942588806,
"num_tokens": 27930109.0,
"reward": 0.6933168172836304,
"reward_std": 0.15646381676197052,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7561109066009521,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.30552273988723755,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.7589545845985413,
"adv/mean_abs_reasoning": 0.41713064908981323,
"adv/mean_abs_step_conf": 0.7742445468902588,
"adv/ratio_final_to_reasoning": 1.8194649236506455,
"adv/ratio_step_to_reasoning": 1.8561200155866626,
"adv/std_final_conf": 0.9303404688835144,
"adv/std_reasoning": 0.7013149261474609,
"adv/std_step_conf": 0.9355771541595459,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.4296875,
"calib/ece": 0.09443999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.004,
"calib/gap": -0.024019047619047496,
"calib/mean_conf": 0.6593199999999999,
"calib/mu_c": 0.6521142857142858,
"calib/mu_w": 0.6761333333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02688,
"calib/std_conf": 0.05826780929466974,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5761407652338215,
"calib/step_q_c_n": 2117.0,
"calib/step_q_gap": -0.03365484415300668,
"calib/step_q_w": 0.6097956093868282,
"calib/step_q_w_n": 1321.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2747.0,
"completions/max_terminated_length": 2747.0,
"completions/mean_length": 832.6953125,
"completions/mean_terminated_length": 849.2828979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.18183837831020355,
"kl": 0.0625762939453125,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0559,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019320238381624222,
"mask/share_reasoning": 0.7809579372406006,
"mask/share_step_conf": 0.1801905632019043,
"num_tokens": 28249407.0,
"reward": 0.6880050897598267,
"reward_std": 0.2226249724626541,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7567011117935181,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.28727781772613525,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.7273871302604675,
"adv/mean_abs_reasoning": 0.32089483737945557,
"adv/mean_abs_step_conf": 0.755443811416626,
"adv/ratio_final_to_reasoning": 2.2667461284219357,
"adv/ratio_step_to_reasoning": 2.354178763316531,
"adv/std_final_conf": 0.9287962317466736,
"adv/std_reasoning": 0.6401641368865967,
"adv/std_step_conf": 0.9360931515693665,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.828125,
"calib/ece": 0.20127490039840634,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.00796812749003984,
"calib/gap": -0.049510761439681805,
"calib/mean_conf": 0.6471713147410358,
"calib/mu_c": 0.6359278350515464,
"calib/mu_w": 0.6854385964912282,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03776892430278886,
"calib/std_conf": 0.04955143946699769,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5711596119929454,
"calib/step_q_c_n": 2268.0,
"calib/step_q_gap": -0.03606479745587354,
"calib/step_q_w": 0.607224409448819,
"calib/step_q_w_n": 1016.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2896.0,
"completions/max_terminated_length": 2896.0,
"completions/mean_length": 728.52734375,
"completions/mean_terminated_length": 746.0120239257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.1024,
"grad_norm": 0.1968483030796051,
"kl": 0.07297515869140625,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0946,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020755115896463394,
"mask/share_reasoning": 0.7726129293441772,
"mask/share_step_conf": 0.18319444358348846,
"num_tokens": 28541726.0,
"reward": 0.7672200202941895,
"reward_std": 0.1859196275472641,
"rewards/accuracy_reward_step": 0.7578125,
"rewards/final_brier_reward_step": 0.7734265327453613,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.4133572578430176,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.7504986524581909,
"adv/mean_abs_reasoning": 0.44124436378479004,
"adv/mean_abs_step_conf": 0.771976113319397,
"adv/ratio_final_to_reasoning": 1.7008685301286586,
"adv/ratio_step_to_reasoning": 1.7495432841288736,
"adv/std_final_conf": 0.9311988353729248,
"adv/std_reasoning": 0.7205365300178528,
"adv/std_step_conf": 0.9355778098106384,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 12.8828125,
"calib/ece": 0.034722222222222224,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0023157747268144346,
"calib/mean_conf": 0.6444047619047618,
"calib/mu_c": 0.6436144578313252,
"calib/mu_w": 0.6459302325581396,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010198412698412694,
"calib/std_conf": 0.04115450535066114,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5757505995203838,
"calib/step_q_c_n": 2085.0,
"calib/step_q_gap": -0.011545360908305491,
"calib/step_q_w": 0.5872959604286893,
"calib/step_q_w_n": 1213.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 753.28125,
"completions/mean_terminated_length": 762.2134399414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.17708462476730347,
"kl": 0.0679931640625,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0231,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020922204479575157,
"mask/share_reasoning": 0.7850449085235596,
"mask/share_step_conf": 0.1823141723871231,
"num_tokens": 28839638.0,
"reward": 0.7043203115463257,
"reward_std": 0.244666188955307,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7601886987686157,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.32188940048217773,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.758463442325592,
"adv/mean_abs_reasoning": 0.44855350255966187,
"adv/mean_abs_step_conf": 0.7728298306465149,
"adv/ratio_final_to_reasoning": 1.690909641764996,
"adv/ratio_step_to_reasoning": 1.7229379020259044,
"adv/std_final_conf": 0.9319073557853699,
"adv/std_reasoning": 0.7205816507339478,
"adv/std_step_conf": 0.9360450506210327,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.90625,
"calib/ece": 0.0566269841269842,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.006484848484848538,
"calib/mean_conf": 0.6557539682539683,
"calib/mu_c": 0.6535151515151515,
"calib/mu_w": 0.66,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.028809523809523823,
"calib/std_conf": 0.04955983821218589,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5806123388581953,
"calib/step_q_c_n": 2172.0,
"calib/step_q_gap": -0.022355960853620238,
"calib/step_q_w": 0.6029682997118155,
"calib/step_q_w_n": 1388.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2082.0,
"completions/max_terminated_length": 2082.0,
"completions/mean_length": 809.77734375,
"completions/mean_terminated_length": 822.6309814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.36100202798843384,
"kl": 0.07733917236328125,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0337,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019724491983652115,
"mask/share_reasoning": 0.7813058495521545,
"mask/share_step_conf": 0.18334467709064484,
"num_tokens": 29153125.0,
"reward": 0.6995540261268616,
"reward_std": 0.24638350307941437,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7565535306930542,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3167732357978821,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7385193109512329,
"adv/mean_abs_reasoning": 0.4850810766220093,
"adv/mean_abs_step_conf": 0.7860969305038452,
"adv/ratio_final_to_reasoning": 1.522465720770037,
"adv/ratio_step_to_reasoning": 1.6205475092494634,
"adv/std_final_conf": 0.9335052967071533,
"adv/std_reasoning": 0.757537305355072,
"adv/std_step_conf": 0.9362009167671204,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 15.12109375,
"calib/ece": 0.17814516129032254,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.010933646506165573,
"calib/mean_conf": 0.6665322580645162,
"calib/mu_c": 0.6613740458015268,
"calib/mu_w": 0.6723076923076924,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15822580645161285,
"calib/std_conf": 0.0557361924871592,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.588147549811524,
"calib/step_q_c_n": 1857.0,
"calib/step_q_gap": -0.013004386633361786,
"calib/step_q_w": 0.6011519364448857,
"calib/step_q_w_n": 2014.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3000.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 914.36328125,
"completions/mean_terminated_length": 940.0682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.1056,
"grad_norm": 0.23129351437091827,
"kl": 0.0624847412109375,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.0659,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018017534166574478,
"mask/share_reasoning": 0.7806091904640198,
"mask/share_step_conf": 0.17402949929237366,
"num_tokens": 29493002.0,
"reward": 0.5717493295669556,
"reward_std": 0.27225303649902344,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7005148530006409,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.14689014852046967,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7555106282234192,
"adv/mean_abs_reasoning": 0.4237300455570221,
"adv/mean_abs_step_conf": 0.792742133140564,
"adv/ratio_final_to_reasoning": 1.7829998985090822,
"adv/ratio_step_to_reasoning": 1.870865994641589,
"adv/std_final_conf": 0.9318886399269104,
"adv/std_reasoning": 0.701408863067627,
"adv/std_step_conf": 0.9356249570846558,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.87109375,
"calib/ece": 0.05052,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0036363636363635488,
"calib/mean_conf": 0.65276,
"calib/mu_c": 0.6513636363636364,
"calib/mu_w": 0.6549999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04364,
"calib/std_conf": 0.050714715813065546,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5829321446260525,
"calib/step_q_c_n": 2019.0,
"calib/step_q_gap": -0.01775976137916946,
"calib/step_q_w": 0.6006919060052219,
"calib/step_q_w_n": 1532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2516.0,
"completions/max_terminated_length": 2516.0,
"completions/mean_length": 833.890625,
"completions/mean_terminated_length": 857.3333129882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.17867591977119446,
"kl": 0.0666046142578125,
"learning_rate": 2.805555555555556e-06,
"loss": -0.1227,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019194157794117928,
"mask/share_reasoning": 0.7794458270072937,
"mask/share_step_conf": 0.17401626706123352,
"num_tokens": 29813886.0,
"reward": 0.6280609965324402,
"reward_std": 0.23585641384124756,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7400511503219604,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.20044583082199097,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.7429351210594177,
"adv/mean_abs_reasoning": 0.4691013693809509,
"adv/mean_abs_step_conf": 0.7799124717712402,
"adv/ratio_final_to_reasoning": 1.5837411049126358,
"adv/ratio_step_to_reasoning": 1.6625670327938944,
"adv/std_final_conf": 0.9320369362831116,
"adv/std_reasoning": 0.7393046021461487,
"adv/std_step_conf": 0.9358084797859192,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.81640625,
"calib/ece": 0.07523809523809512,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003968253968253968,
"calib/gap": 0.0018114289437675124,
"calib/mean_conf": 0.6665079365079364,
"calib/mu_c": 0.667248322147651,
"calib/mu_w": 0.6654368932038834,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07523809523809512,
"calib/std_conf": 0.057173051987101216,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5900836042731074,
"calib/step_q_c_n": 2153.0,
"calib/step_q_gap": -0.017337127434209787,
"calib/step_q_w": 0.6074207317073171,
"calib/step_q_w_n": 1640.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 876.9765625,
"completions/mean_terminated_length": 883.8818969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.20067480206489563,
"kl": 0.0763702392578125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0548,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01860753819346428,
"mask/share_reasoning": 0.790267825126648,
"mask/share_step_conf": 0.18331214785575867,
"num_tokens": 30145384.0,
"reward": 0.6333275437355042,
"reward_std": 0.24875670671463013,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7385531067848206,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.2148207128047943,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.7533099055290222,
"adv/mean_abs_reasoning": 0.22623670101165771,
"adv/mean_abs_step_conf": 0.7874011993408203,
"adv/ratio_final_to_reasoning": 3.329742266221452,
"adv/ratio_step_to_reasoning": 3.4804308753611397,
"adv/std_final_conf": 0.9287867546081543,
"adv/std_reasoning": 0.5226889252662659,
"adv/std_step_conf": 0.9349228143692017,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.01953125,
"calib/ece": 0.17559055118110245,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.031882022471910076,
"calib/mean_conf": 0.6501574803149606,
"calib/mu_c": 0.6406179775280899,
"calib/mu_w": 0.6725,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06248031496062993,
"calib/std_conf": 0.061432816539549094,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5780414312617702,
"calib/step_q_c_n": 2124.0,
"calib/step_q_gap": -0.025589668820942713,
"calib/step_q_w": 0.6036311000827129,
"calib/step_q_w_n": 1209.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 713.87109375,
"completions/mean_terminated_length": 722.3359985351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.1088,
"grad_norm": 0.19545966386795044,
"kl": 0.086181640625,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0009,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02277192659676075,
"mask/share_reasoning": 0.775929868221283,
"mask/share_step_conf": 0.18957942724227905,
"num_tokens": 30434831.0,
"reward": 0.7263565063476562,
"reward_std": 0.15422692894935608,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7645875215530396,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.35062551498413086,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.7185190916061401,
"adv/mean_abs_reasoning": 0.3661821484565735,
"adv/mean_abs_step_conf": 0.7697737216949463,
"adv/ratio_final_to_reasoning": 1.9621903870372621,
"adv/ratio_step_to_reasoning": 2.102160700458711,
"adv/std_final_conf": 0.9287464022636414,
"adv/std_reasoning": 0.6611608266830444,
"adv/std_step_conf": 0.9357419610023499,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.0,
"calib/ece": 0.1333333333333334,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.007936507936507936,
"calib/gap": -0.03233678901653125,
"calib/mean_conf": 0.6640476190476191,
"calib/mu_c": 0.6530120481927711,
"calib/mu_w": 0.6853488372093024,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06932539682539682,
"calib/std_conf": 0.07228337721298855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5885881801125704,
"calib/step_q_c_n": 2132.0,
"calib/step_q_gap": -0.021466916306162376,
"calib/step_q_w": 0.6100550964187328,
"calib/step_q_w_n": 1452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2292.0,
"completions/max_terminated_length": 2292.0,
"completions/mean_length": 879.5703125,
"completions/mean_terminated_length": 893.5317993164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 297.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.18727880716323853,
"kl": 0.0642547607421875,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0503,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019160684198141098,
"mask/share_reasoning": 0.7917564511299133,
"mask/share_step_conf": 0.17345784604549408,
"num_tokens": 30764553.0,
"reward": 0.6658133268356323,
"reward_std": 0.18848654627799988,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7436000108718872,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.26146411895751953,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.7070366144180298,
"adv/mean_abs_reasoning": 0.3533375561237335,
"adv/mean_abs_step_conf": 0.7518788576126099,
"adv/ratio_final_to_reasoning": 2.0010231071231956,
"adv/ratio_step_to_reasoning": 2.1279335994199076,
"adv/std_final_conf": 0.915729820728302,
"adv/std_reasoning": 0.661066472530365,
"adv/std_step_conf": 0.9352713823318481,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.3046875,
"calib/ece": 0.083203125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": -0.0003262764632627757,
"calib/mean_conf": 0.653359375,
"calib/mu_c": 0.6532191780821918,
"calib/mu_w": 0.6535454545454545,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.083125,
"calib/std_conf": 0.05210082388609009,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5790608604407136,
"calib/step_q_c_n": 1906.0,
"calib/step_q_gap": -0.0037258062259530877,
"calib/step_q_w": 0.5827866666666667,
"calib/step_q_w_n": 1500.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1925.0,
"completions/max_terminated_length": 1925.0,
"completions/mean_length": 782.90625,
"completions/mean_terminated_length": 789.0708618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.20894844830036163,
"kl": 0.07109832763671875,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0184,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02064879611134529,
"mask/share_reasoning": 0.7865666747093201,
"mask/share_step_conf": 0.18497204780578613,
"num_tokens": 31071657.0,
"reward": 0.646728515625,
"reward_std": 0.18793892860412598,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7451726794242859,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.23422178626060486,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7167232632637024,
"adv/mean_abs_reasoning": 0.33493661880493164,
"adv/mean_abs_step_conf": 0.7485485076904297,
"adv/ratio_final_to_reasoning": 2.1398772872939427,
"adv/ratio_step_to_reasoning": 2.2348959942370086,
"adv/std_final_conf": 0.9292554259300232,
"adv/std_reasoning": 0.6611365079879761,
"adv/std_step_conf": 0.9348087906837463,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.68359375,
"calib/ece": 0.2034126984126984,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003968253968253968,
"calib/gap": -0.04890624999999982,
"calib/mean_conf": 0.6552380952380953,
"calib/mu_c": 0.64359375,
"calib/mu_w": 0.6924999999999998,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04837301587301586,
"calib/std_conf": 0.05699783184005442,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5769583161370201,
"calib/step_q_c_n": 2423.0,
"calib/step_q_gap": -0.03884723941853552,
"calib/step_q_w": 0.6158055555555556,
"calib/step_q_w_n": 1080.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2420.0,
"completions/max_terminated_length": 2420.0,
"completions/mean_length": 818.63671875,
"completions/mean_terminated_length": 828.3439331054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.112,
"grad_norm": 0.16072368621826172,
"kl": 0.06591796875,
"learning_rate": 2.666666666666667e-06,
"loss": -0.048,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019962280988693237,
"mask/share_reasoning": 0.7864009141921997,
"mask/share_step_conf": 0.18191802501678467,
"num_tokens": 31386988.0,
"reward": 0.8123526573181152,
"reward_std": 0.18710803985595703,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7739390134811401,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.5038912296295166,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.7355873584747314,
"adv/mean_abs_reasoning": 0.3430485725402832,
"adv/mean_abs_step_conf": 0.743608832359314,
"adv/ratio_final_to_reasoning": 2.14426590680057,
"adv/ratio_step_to_reasoning": 2.1676488167633874,
"adv/std_final_conf": 0.9301002025604248,
"adv/std_reasoning": 0.640177309513092,
"adv/std_step_conf": 0.9360626935958862,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.17578125,
"calib/ece": 0.039764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.008565789473684138,
"calib/mean_conf": 0.6529411764705882,
"calib/mu_c": 0.64975,
"calib/mu_w": 0.6583157894736842,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03262745098039218,
"calib/std_conf": 0.04668182321015065,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5782606541129832,
"calib/step_q_c_n": 2018.0,
"calib/step_q_gap": -0.009008718580743702,
"calib/step_q_w": 0.5872693726937269,
"calib/step_q_w_n": 1355.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1838.0,
"completions/max_terminated_length": 1838.0,
"completions/mean_length": 783.30859375,
"completions/mean_terminated_length": 789.4763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.1853887140750885,
"kl": 0.06472015380859375,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0049,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019807640463113785,
"mask/share_reasoning": 0.7897074222564697,
"mask/share_step_conf": 0.1826724410057068,
"num_tokens": 31692099.0,
"reward": 0.6519793272018433,
"reward_std": 0.19260109961032867,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7564437389373779,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.2232961654663086,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.7967756390571594,
"adv/mean_abs_reasoning": 0.45540252327919006,
"adv/mean_abs_step_conf": 0.7411437034606934,
"adv/ratio_final_to_reasoning": 1.749607431508864,
"adv/ratio_step_to_reasoning": 1.6274475119812328,
"adv/std_final_conf": 0.9317139983177185,
"adv/std_reasoning": 0.7013875842094421,
"adv/std_step_conf": 0.9358682632446289,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.51953125,
"calib/ece": 0.026601562499999974,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.011492768959435629,
"calib/mean_conf": 0.6569921875,
"calib/mu_c": 0.6606285714285713,
"calib/mu_w": 0.6491358024691357,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.0451639368187146,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5853536184210526,
"calib/step_q_c_n": 2432.0,
"calib/step_q_gap": 0.009056242327758146,
"calib/step_q_w": 0.5762973760932945,
"calib/step_q_w_n": 1029.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1792.0,
"completions/max_terminated_length": 1792.0,
"completions/mean_length": 778.984375,
"completions/mean_terminated_length": 785.1181030273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.1831437200307846,
"kl": 0.07171630859375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.004,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020345225930213928,
"mask/share_reasoning": 0.779441237449646,
"mask/share_step_conf": 0.19240108132362366,
"num_tokens": 31996135.0,
"reward": 0.7552859783172607,
"reward_std": 0.21182772517204285,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.785930871963501,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.38792234659194946,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.7637505531311035,
"adv/mean_abs_reasoning": 0.30685943365097046,
"adv/mean_abs_step_conf": 0.7783085107803345,
"adv/ratio_final_to_reasoning": 2.488926424858792,
"adv/ratio_step_to_reasoning": 2.536368204555842,
"adv/std_final_conf": 0.9282589554786682,
"adv/std_reasoning": 0.5959946513175964,
"adv/std_step_conf": 0.9353668689727783,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.3125,
"calib/ece": 0.1579446640316206,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.014256097560975567,
"calib/mean_conf": 0.6609486166007905,
"calib/mu_c": 0.6582439024390244,
"calib/mu_w": 0.6725,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.004308300395256917,
"calib/std_conf": 0.04583750005322809,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5833650793650794,
"calib/step_q_c_n": 2835.0,
"calib/step_q_gap": -0.02138763474831029,
"calib/step_q_w": 0.6047527141133897,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2196.0,
"completions/max_terminated_length": 2196.0,
"completions/mean_length": 815.5625,
"completions/mean_terminated_length": 828.5079956054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.1152,
"grad_norm": 0.1538919061422348,
"kl": 0.06288909912109375,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.0532,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01957569271326065,
"mask/share_reasoning": 0.7753883600234985,
"mask/share_step_conf": 0.18941092491149902,
"num_tokens": 32308151.0,
"reward": 0.7745775580406189,
"reward_std": 0.19146263599395752,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8079085946083069,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.383433997631073,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.7414380311965942,
"adv/mean_abs_reasoning": 0.33757272362709045,
"adv/mean_abs_step_conf": 0.7734533548355103,
"adv/ratio_final_to_reasoning": 2.196380155452504,
"adv/ratio_step_to_reasoning": 2.291219937810876,
"adv/std_final_conf": 0.9311279058456421,
"adv/std_reasoning": 0.6185241937637329,
"adv/std_step_conf": 0.9362562298774719,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.640625,
"calib/ece": 0.11004048582995961,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.016194331983805668,
"calib/gap": -0.017967011128775634,
"calib/mean_conf": 0.6606477732793522,
"calib/mu_c": 0.6525735294117647,
"calib/mu_w": 0.6705405405405404,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11004048582995961,
"calib/std_conf": 0.06774255733281026,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.577875150060024,
"calib/step_q_c_n": 1666.0,
"calib/step_q_gap": -0.048762698162838625,
"calib/step_q_w": 0.6266378482228626,
"calib/step_q_w_n": 2082.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2896.0,
"completions/max_terminated_length": 2896.0,
"completions/mean_length": 815.49609375,
"completions/mean_terminated_length": 838.421630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.1558358371257782,
"kl": 0.06879425048828125,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0197,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019493218511343002,
"mask/share_reasoning": 0.7717585563659668,
"mask/share_step_conf": 0.1814044862985611,
"num_tokens": 32621518.0,
"reward": 0.5735743641853333,
"reward_std": 0.20711404085159302,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7014141082763672,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.14651596546173096,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7351251840591431,
"adv/mean_abs_reasoning": 0.3662223219871521,
"adv/mean_abs_step_conf": 0.7691531181335449,
"adv/ratio_final_to_reasoning": 2.0073194339173375,
"adv/ratio_step_to_reasoning": 2.1002354907261185,
"adv/std_final_conf": 0.9158840179443359,
"adv/std_reasoning": 0.6611154079437256,
"adv/std_step_conf": 0.9360529184341431,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.3203125,
"calib/ece": 0.11093750000000009,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.006303030303030366,
"calib/mean_conf": 0.650625,
"calib/mu_c": 0.6489999999999999,
"calib/mu_w": 0.6553030303030303,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.009687500000000012,
"calib/std_conf": 0.050003906097424035,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5814681107099879,
"calib/step_q_c_n": 2493.0,
"calib/step_q_gap": -0.00413712375020836,
"calib/step_q_w": 0.5856052344601963,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1584.0,
"completions/max_terminated_length": 1584.0,
"completions/mean_length": 742.68359375,
"completions/mean_terminated_length": 748.531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.1851894110441208,
"kl": 0.07032012939453125,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0597,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.021798258647322655,
"mask/share_reasoning": 0.7785544395446777,
"mask/share_step_conf": 0.19183480739593506,
"num_tokens": 32916565.0,
"reward": 0.7617905139923096,
"reward_std": 0.21278932690620422,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7953585982322693,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.37978485226631165,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.7544980049133301,
"adv/mean_abs_reasoning": 0.3003673851490021,
"adv/mean_abs_step_conf": 0.751507043838501,
"adv/ratio_final_to_reasoning": 2.5119172127794407,
"adv/ratio_step_to_reasoning": 2.501959536870835,
"adv/std_final_conf": 0.9297202229499817,
"adv/std_reasoning": 0.5959699749946594,
"adv/std_step_conf": 0.935775101184845,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.90234375,
"calib/ece": 0.19356862745098033,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.040253217449440104,
"calib/mean_conf": 0.6593725490196077,
"calib/mu_c": 0.6495854922279793,
"calib/mu_w": 0.6898387096774194,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04803921568627452,
"calib/std_conf": 0.05574098389568812,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5818112449799197,
"calib/step_q_c_n": 2490.0,
"calib/step_q_gap": -0.02305311423429912,
"calib/step_q_w": 0.6048643592142188,
"calib/step_q_w_n": 1069.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 826.3203125,
"completions/mean_terminated_length": 829.5608520507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.1184,
"grad_norm": 0.17247696220874786,
"kl": 0.060150146484375,
"learning_rate": 2.5e-06,
"loss": 0.0448,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020120007917284966,
"mask/share_reasoning": 0.7870587110519409,
"mask/share_step_conf": 0.18891501426696777,
"num_tokens": 33235511.0,
"reward": 0.7798572182655334,
"reward_std": 0.18497300148010254,
"rewards/accuracy_reward_step": 0.75390625,
"rewards/final_brier_reward_step": 0.7854719161987305,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.42424261569976807,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.748249888420105,
"adv/mean_abs_reasoning": 0.37312665581703186,
"adv/mean_abs_step_conf": 0.7734876871109009,
"adv/ratio_final_to_reasoning": 2.005350935814729,
"adv/ratio_step_to_reasoning": 2.072989627120588,
"adv/std_final_conf": 0.9300276041030884,
"adv/std_reasoning": 0.6611367464065552,
"adv/std_step_conf": 0.935664176940918,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.75,
"calib/ece": 0.060157480314960605,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.013223016683690858,
"calib/mean_conf": 0.6651181102362205,
"calib/mu_c": 0.6604848484848485,
"calib/mu_w": 0.6737078651685393,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.037834645669291324,
"calib/std_conf": 0.04989542042217861,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5863486005089059,
"calib/step_q_c_n": 2358.0,
"calib/step_q_gap": -0.00951176620477534,
"calib/step_q_w": 0.5958603667136813,
"calib/step_q_w_n": 1418.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2187.0,
"completions/max_terminated_length": 2187.0,
"completions/mean_length": 875.0078125,
"completions/mean_terminated_length": 881.8976440429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.16523876786231995,
"kl": 0.0580902099609375,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017950210720300674,
"mask/share_reasoning": 0.7930775284767151,
"mask/share_step_conf": 0.18115977942943573,
"num_tokens": 33567433.0,
"reward": 0.714263379573822,
"reward_std": 0.2062770277261734,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7576664686203003,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.34351664781570435,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.7786873579025269,
"adv/mean_abs_reasoning": 0.37065157294273376,
"adv/mean_abs_step_conf": 0.753793478012085,
"adv/ratio_final_to_reasoning": 2.100860794196158,
"adv/ratio_step_to_reasoning": 2.0336983114018707,
"adv/std_final_conf": 0.9285604953765869,
"adv/std_reasoning": 0.640232264995575,
"adv/std_step_conf": 0.9355056881904602,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.9765625,
"calib/ece": 0.05321568627450977,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.016989413025998434,
"calib/mean_conf": 0.6503921568627451,
"calib/mu_c": 0.6443292682926829,
"calib/mu_w": 0.6613186813186813,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.030235294117647044,
"calib/std_conf": 0.054267340566962446,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5788158554942299,
"calib/step_q_c_n": 1993.0,
"calib/step_q_gap": -0.012749231036996655,
"calib/step_q_w": 0.5915650865312265,
"calib/step_q_w_n": 1329.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2239.0,
"completions/max_terminated_length": 2239.0,
"completions/mean_length": 730.3125,
"completions/mean_terminated_length": 733.176513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 308.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.23360399901866913,
"kl": 0.07427215576171875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0025,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022047486156225204,
"mask/share_reasoning": 0.7772487998008728,
"mask/share_step_conf": 0.19679749011993408,
"num_tokens": 33859593.0,
"reward": 0.6659376621246338,
"reward_std": 0.18285523355007172,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7567245960235596,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.24780693650245667,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.7306428551673889,
"adv/mean_abs_reasoning": 0.27716708183288574,
"adv/mean_abs_step_conf": 0.7564301490783691,
"adv/ratio_final_to_reasoning": 2.636109780193596,
"adv/ratio_step_to_reasoning": 2.7291485845871435,
"adv/std_final_conf": 0.9272521138191223,
"adv/std_reasoning": 0.5726190209388733,
"adv/std_step_conf": 0.9354404211044312,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.015625,
"calib/ece": 0.18437007874015748,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.036960451977401054,
"calib/mean_conf": 0.649251968503937,
"calib/mu_c": 0.6406666666666668,
"calib/mu_w": 0.6776271186440679,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0329527559055118,
"calib/std_conf": 0.050554323865316636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5732784755305326,
"calib/step_q_c_n": 2309.0,
"calib/step_q_gap": -0.03767949123388581,
"calib/step_q_w": 0.6109579667644184,
"calib/step_q_w_n": 1023.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 745.19140625,
"completions/mean_terminated_length": 751.05908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.1216,
"grad_norm": 0.15612445771694183,
"kl": 0.0630950927734375,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0114,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021982520818710327,
"mask/share_reasoning": 0.775760293006897,
"mask/share_step_conf": 0.1944447159767151,
"num_tokens": 34155386.0,
"reward": 0.7840208411216736,
"reward_std": 0.17199936509132385,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7857136726379395,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.43154671788215637,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.7011224031448364,
"adv/mean_abs_reasoning": 0.2878670394420624,
"adv/mean_abs_step_conf": 0.7257644534111023,
"adv/ratio_final_to_reasoning": 2.435577218231502,
"adv/ratio_step_to_reasoning": 2.5211794126127227,
"adv/std_final_conf": 0.9287891983985901,
"adv/std_reasoning": 0.618270218372345,
"adv/std_step_conf": 0.9357025623321533,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.66015625,
"calib/ece": 0.012755905511811032,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0004261003138983366,
"calib/mean_conf": 0.6466141732283465,
"calib/mu_c": 0.6467701863354037,
"calib/mu_w": 0.6463440860215054,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.012755905511811032,
"calib/std_conf": 0.03710424710798357,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5752891744933267,
"calib/step_q_c_n": 2023.0,
"calib/step_q_gap": -0.006090135851500866,
"calib/step_q_w": 0.5813793103448276,
"calib/step_q_w_n": 1218.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2584.0,
"completions/max_terminated_length": 2584.0,
"completions/mean_length": 743.84765625,
"completions/mean_terminated_length": 749.7047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.20659328997135162,
"kl": 0.06292724609375,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0252,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020182587206363678,
"mask/share_reasoning": 0.7855570316314697,
"mask/share_step_conf": 0.1864479035139084,
"num_tokens": 34451075.0,
"reward": 0.657963752746582,
"reward_std": 0.1874353587627411,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7605875134468079,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.2311212718486786,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7466577291488647,
"adv/mean_abs_reasoning": 0.3631676137447357,
"adv/mean_abs_step_conf": 0.7654522657394409,
"adv/ratio_final_to_reasoning": 2.055959014213414,
"adv/ratio_step_to_reasoning": 2.1077107009807987,
"adv/std_final_conf": 0.9302747249603271,
"adv/std_reasoning": 0.640287458896637,
"adv/std_step_conf": 0.935708224773407,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.53515625,
"calib/ece": 0.08755905511811027,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.003269878574226337,
"calib/mean_conf": 0.6592125984251969,
"calib/mu_c": 0.6583243243243243,
"calib/mu_w": 0.6615942028985506,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0092125984251969,
"calib/std_conf": 0.05507484240991102,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5850990699555196,
"calib/step_q_c_n": 2473.0,
"calib/step_q_gap": -0.004890849399319053,
"calib/step_q_w": 0.5899899193548387,
"calib/step_q_w_n": 992.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1781.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 839.34765625,
"completions/mean_terminated_length": 849.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.1914065182209015,
"kl": 0.0562591552734375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0423,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01951763778924942,
"mask/share_reasoning": 0.7906308174133301,
"mask/share_step_conf": 0.1781328320503235,
"num_tokens": 34770468.0,
"reward": 0.7317217588424683,
"reward_std": 0.22887194156646729,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7868398427963257,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.33363497257232666,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7924519777297974,
"adv/mean_abs_reasoning": 0.33524632453918457,
"adv/mean_abs_step_conf": 0.7852796316146851,
"adv/ratio_final_to_reasoning": 2.363790203573651,
"adv/ratio_step_to_reasoning": 2.3423959463063384,
"adv/std_final_conf": 0.9294283390045166,
"adv/std_reasoning": 0.5961469411849976,
"adv/std_step_conf": 0.935607373714447,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.48828125,
"calib/ece": 0.07921568627450978,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": -0.018871558474362282,
"calib/mean_conf": 0.6596078431372548,
"calib/mu_c": 0.6516891891891892,
"calib/mu_w": 0.6705607476635514,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07921568627450978,
"calib/std_conf": 0.06135594588422498,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5792249730893434,
"calib/step_q_c_n": 1858.0,
"calib/step_q_gap": -0.018856531612850946,
"calib/step_q_w": 0.5980815047021943,
"calib/step_q_w_n": 1595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1996.0,
"completions/max_terminated_length": 1996.0,
"completions/mean_length": 792.65625,
"completions/mean_terminated_length": 798.8976440429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.1248,
"grad_norm": 0.1906462460756302,
"kl": 0.05759429931640625,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0051,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01992100104689598,
"mask/share_reasoning": 0.7847388982772827,
"mask/share_step_conf": 0.18752756714820862,
"num_tokens": 35079988.0,
"reward": 0.6975091695785522,
"reward_std": 0.19168932735919952,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7343515753746033,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3458230495452881,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.751489520072937,
"adv/mean_abs_reasoning": 0.23559094965457916,
"adv/mean_abs_step_conf": 0.7602692246437073,
"adv/ratio_final_to_reasoning": 3.1898064045956036,
"adv/ratio_step_to_reasoning": 3.227073135697299,
"adv/std_final_conf": 0.9276825189590454,
"adv/std_reasoning": 0.5226951241493225,
"adv/std_step_conf": 0.9354988932609558,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.7734375,
"calib/ece": 0.19570312500000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0078125,
"calib/gap": -0.0464843909406244,
"calib/mean_conf": 0.6641406249999999,
"calib/mu_c": 0.6483431952662722,
"calib/mu_w": 0.6948275862068966,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09984375000000002,
"calib/std_conf": 0.06216821112601982,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5820426977195535,
"calib/step_q_c_n": 2061.0,
"calib/step_q_gap": -0.02276958896986636,
"calib/step_q_w": 0.6048122866894199,
"calib/step_q_w_n": 1465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2131.0,
"completions/max_terminated_length": 2131.0,
"completions/mean_length": 854.04296875,
"completions/mean_terminated_length": 860.7677001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.1517084538936615,
"kl": 0.05948638916015625,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0313,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018919751048088074,
"mask/share_reasoning": 0.7929970026016235,
"mask/share_step_conf": 0.1802707016468048,
"num_tokens": 35402631.0,
"reward": 0.7126184105873108,
"reward_std": 0.18107867240905762,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7509117126464844,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.34229379892349243,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.7432209849357605,
"adv/mean_abs_reasoning": 0.3506610691547394,
"adv/mean_abs_step_conf": 0.7597742080688477,
"adv/ratio_final_to_reasoning": 2.119485310209308,
"adv/ratio_step_to_reasoning": 2.1666910726653126,
"adv/std_final_conf": 0.9295665621757507,
"adv/std_reasoning": 0.6611495614051819,
"adv/std_step_conf": 0.9353212118148804,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.15625,
"calib/ece": 0.15254901960784312,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.01568627450980392,
"calib/gap": -0.03208333333333324,
"calib/mean_conf": 0.6621960784313726,
"calib/mu_c": 0.65125,
"calib/mu_w": 0.6833333333333332,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0779607843137255,
"calib/std_conf": 0.06584504078809585,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5848651162790698,
"calib/step_q_c_n": 2150.0,
"calib/step_q_gap": -0.03094899498280268,
"calib/step_q_w": 0.6158141112618725,
"calib/step_q_w_n": 1474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2329.0,
"completions/max_terminated_length": 2329.0,
"completions/mean_length": 848.875,
"completions/mean_terminated_length": 855.55908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 267.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.16920378804206848,
"kl": 0.05426788330078125,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0294,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019031208008527756,
"mask/share_reasoning": 0.7919583320617676,
"mask/share_step_conf": 0.18119795620441437,
"num_tokens": 35725007.0,
"reward": 0.7125355005264282,
"reward_std": 0.19247612357139587,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7510000467300415,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.3443836271762848,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.7433023452758789,
"adv/mean_abs_reasoning": 0.2874475121498108,
"adv/mean_abs_step_conf": 0.766741156578064,
"adv/ratio_final_to_reasoning": 2.5858715551814813,
"adv/ratio_step_to_reasoning": 2.667412742046822,
"adv/std_final_conf": 0.926930844783783,
"adv/std_reasoning": 0.5958617925643921,
"adv/std_step_conf": 0.9352979063987732,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.84375,
"calib/ece": 0.14555118110236218,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.021628289473684337,
"calib/mean_conf": 0.6541338582677165,
"calib/mu_c": 0.6486842105263158,
"calib/mu_w": 0.6703125000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.025826771653543287,
"calib/std_conf": 0.043406132264746156,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5776756066411239,
"calib/step_q_c_n": 2349.0,
"calib/step_q_gap": -0.009310548843434185,
"calib/step_q_w": 0.5869861554845581,
"calib/step_q_w_n": 939.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2198.0,
"completions/max_terminated_length": 2198.0,
"completions/mean_length": 796.8125,
"completions/mean_terminated_length": 803.0866088867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 308.0,
"epoch": 0.128,
"grad_norm": 0.16395922005176544,
"kl": 0.0589599609375,
"learning_rate": 2.25e-06,
"loss": -0.0356,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019536158069968224,
"mask/share_reasoning": 0.7872902154922485,
"mask/share_step_conf": 0.1853610873222351,
"num_tokens": 36035679.0,
"reward": 0.7953190207481384,
"reward_std": 0.16985073685646057,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7864730358123779,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.45728999376296997,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7642772793769836,
"adv/mean_abs_reasoning": 0.3697505593299866,
"adv/mean_abs_step_conf": 0.741147518157959,
"adv/ratio_final_to_reasoning": 2.0670077707574173,
"adv/ratio_step_to_reasoning": 2.0044527302432464,
"adv/std_final_conf": 0.9294301271438599,
"adv/std_reasoning": 0.6402899622917175,
"adv/std_step_conf": 0.9361931085586548,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.98828125,
"calib/ece": 0.13168627450980389,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.017973352033660683,
"calib/mean_conf": 0.6667450980392158,
"calib/mu_c": 0.6618817204301075,
"calib/mu_w": 0.6798550724637682,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03450980392156862,
"calib/std_conf": 0.05298162579658359,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5841450777202072,
"calib/step_q_c_n": 2509.0,
"calib/step_q_gap": -0.017767235712628615,
"calib/step_q_w": 0.6019123134328358,
"calib/step_q_w_n": 1072.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1936.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 884.28125,
"completions/mean_terminated_length": 891.2440795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.18738946318626404,
"kl": 0.05516815185546875,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0092,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017908845096826553,
"mask/share_reasoning": 0.7953507900238037,
"mask/share_step_conf": 0.17892783880233765,
"num_tokens": 36367111.0,
"reward": 0.7506467700004578,
"reward_std": 0.22363224625587463,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.7857195138931274,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.37104272842407227,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.7539645433425903,
"adv/mean_abs_reasoning": 0.3049893379211426,
"adv/mean_abs_step_conf": 0.7673645615577698,
"adv/ratio_final_to_reasoning": 2.4721013150221465,
"adv/ratio_step_to_reasoning": 2.516037336872996,
"adv/std_final_conf": 0.9275853037834167,
"adv/std_reasoning": 0.5960739850997925,
"adv/std_step_conf": 0.9354673027992249,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.76171875,
"calib/ece": 0.09349397590361447,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007713487629688598,
"calib/mean_conf": 0.6608835341365462,
"calib/mu_c": 0.6587150837988827,
"calib/mu_w": 0.6664285714285713,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01775100401606426,
"calib/std_conf": 0.044143141684649084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.579853268119164,
"calib/step_q_c_n": 2249.0,
"calib/step_q_gap": -0.028584722461683643,
"calib/step_q_w": 0.6084379905808477,
"calib/step_q_w_n": 1274.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2098.0,
"completions/max_terminated_length": 2098.0,
"completions/mean_length": 832.37890625,
"completions/mean_terminated_length": 855.7791137695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 417.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.20258116722106934,
"kl": 0.0553741455078125,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.152,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01726049929857254,
"mask/share_reasoning": 0.7798194289207458,
"mask/share_step_conf": 0.1755763292312622,
"num_tokens": 36687544.0,
"reward": 0.7311630249023438,
"reward_std": 0.16484180092811584,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7678898572921753,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.3600612282752991,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7517590522766113,
"adv/mean_abs_reasoning": 0.42011213302612305,
"adv/mean_abs_step_conf": 0.7539490461349487,
"adv/ratio_final_to_reasoning": 1.7894247587223722,
"adv/ratio_step_to_reasoning": 1.7946376380614253,
"adv/std_final_conf": 0.9308037757873535,
"adv/std_reasoning": 0.701386034488678,
"adv/std_step_conf": 0.9355675578117371,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.4375,
"calib/ece": 0.03579999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.012561142774691447,
"calib/mean_conf": 0.66932,
"calib/mu_c": 0.6651497005988025,
"calib/mu_w": 0.677710843373494,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01856000000000001,
"calib/std_conf": 0.05720085314049084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5876273400087071,
"calib/step_q_c_n": 2297.0,
"calib/step_q_gap": -0.01649703454454532,
"calib/step_q_w": 0.6041243745532524,
"calib/step_q_w_n": 1399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2505.0,
"completions/max_terminated_length": 2505.0,
"completions/mean_length": 932.6171875,
"completions/mean_terminated_length": 955.0000610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 372.0,
"epoch": 0.1312,
"grad_norm": 0.1839117556810379,
"kl": 0.0541534423828125,
"learning_rate": 2.166666666666667e-06,
"loss": -0.1264,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.016855834051966667,
"mask/share_reasoning": 0.7892988920211792,
"mask/share_step_conf": 0.17040777206420898,
"num_tokens": 37031582.0,
"reward": 0.7302656173706055,
"reward_std": 0.21496935188770294,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7513464689254761,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.3834035396575928,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.7521032094955444,
"adv/mean_abs_reasoning": 0.32858970761299133,
"adv/mean_abs_step_conf": 0.756147563457489,
"adv/ratio_final_to_reasoning": 2.288882433229959,
"adv/ratio_step_to_reasoning": 2.301190651863234,
"adv/std_final_conf": 0.9297972321510315,
"adv/std_reasoning": 0.6184530854225159,
"adv/std_step_conf": 0.9357353448867798,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.43359375,
"calib/ece": 0.07372549019607842,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.003072407045009773,
"calib/mean_conf": 0.6594509803921569,
"calib/mu_c": 0.6585714285714286,
"calib/mu_w": 0.6616438356164384,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.009725490196078435,
"calib/std_conf": 0.04873792006210367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5836648558295028,
"calib/step_q_c_n": 2393.0,
"calib/step_q_gap": -0.004709905164761108,
"calib/step_q_w": 0.5883747609942639,
"calib/step_q_w_n": 1046.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 884.359375,
"completions/mean_terminated_length": 887.8275146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 319.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.18207412958145142,
"kl": 0.050323486328125,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0203,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017641527578234673,
"mask/share_reasoning": 0.8057651519775391,
"mask/share_step_conf": 0.1726871132850647,
"num_tokens": 37364794.0,
"reward": 0.7761149406433105,
"reward_std": 0.20994237065315247,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.786019504070282,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.42480406165122986,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7799336910247803,
"adv/mean_abs_reasoning": 0.4838736057281494,
"adv/mean_abs_step_conf": 0.7964974045753479,
"adv/ratio_final_to_reasoning": 1.611854173883094,
"adv/ratio_step_to_reasoning": 1.646085661929734,
"adv/std_final_conf": 0.9312655329704285,
"adv/std_reasoning": 0.7206544280052185,
"adv/std_step_conf": 0.9362965822219849,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.15625,
"calib/ece": 0.05118110236220468,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.004841906559697962,
"calib/mean_conf": 0.6553543307086616,
"calib/mu_c": 0.6536196319018406,
"calib/mu_w": 0.6584615384615385,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03240157480314956,
"calib/std_conf": 0.051164321900868295,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5798667301285102,
"calib/step_q_c_n": 2101.0,
"calib/step_q_gap": -0.01134873948474946,
"calib/step_q_w": 0.5912154696132597,
"calib/step_q_w_n": 1267.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2128.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 860.84375,
"completions/mean_terminated_length": 874.5079956054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.1974727362394333,
"kl": 0.05866241455078125,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.024,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01837814599275589,
"mask/share_reasoning": 0.7949548959732056,
"mask/share_step_conf": 0.17104199528694153,
"num_tokens": 37689978.0,
"reward": 0.6941776275634766,
"reward_std": 0.24238252639770508,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.759081244468689,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.3034926950931549,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.7817037105560303,
"adv/mean_abs_reasoning": 0.3521580100059509,
"adv/mean_abs_step_conf": 0.7613109946250916,
"adv/ratio_final_to_reasoning": 2.219752748326868,
"adv/ratio_step_to_reasoning": 2.1618448906280068,
"adv/std_final_conf": 0.9274759292602539,
"adv/std_reasoning": 0.6187031865119934,
"adv/std_step_conf": 0.9351398348808289,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.015625,
"calib/ece": 0.15972222222222232,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03895386614684848,
"calib/mean_conf": 0.6592460317460317,
"calib/mu_c": 0.6467251461988305,
"calib/mu_w": 0.685679012345679,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0701984126984127,
"calib/std_conf": 0.046766761108402846,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5762054604349838,
"calib/step_q_c_n": 2161.0,
"calib/step_q_gap": -0.031425934099003516,
"calib/step_q_w": 0.6076313945339873,
"calib/step_q_w_n": 1427.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2391.0,
"completions/max_terminated_length": 2391.0,
"completions/mean_length": 861.546875,
"completions/mean_terminated_length": 875.2222900390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.1344,
"grad_norm": 0.17764291167259216,
"kl": 0.053813934326171875,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.073,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018441149964928627,
"mask/share_reasoning": 0.7864916920661926,
"mask/share_step_conf": 0.1794421523809433,
"num_tokens": 38015998.0,
"reward": 0.7014974355697632,
"reward_std": 0.21991148591041565,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7504230737686157,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.32210302352905273,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.7877411842346191,
"adv/mean_abs_reasoning": 0.3967254161834717,
"adv/mean_abs_step_conf": 0.7814566493034363,
"adv/ratio_final_to_reasoning": 1.985608060639897,
"adv/ratio_step_to_reasoning": 1.969767041449242,
"adv/std_final_conf": 0.9294294714927673,
"adv/std_reasoning": 0.6613271832466125,
"adv/std_step_conf": 0.9356931447982788,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 14.62109375,
"calib/ece": 0.040963855421686735,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.014924885241340835,
"calib/mean_conf": 0.6660240963855422,
"calib/mu_c": 0.6605696202531646,
"calib/mu_w": 0.6754945054945054,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03622489959839359,
"calib/std_conf": 0.0469282908233953,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5867464114832537,
"calib/step_q_c_n": 2090.0,
"calib/step_q_gap": -0.022769619974701505,
"calib/step_q_w": 0.6095160314579552,
"calib/step_q_w_n": 1653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 843.4765625,
"completions/mean_terminated_length": 863.7200317382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.20979590713977814,
"kl": 0.06250762939453125,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0176,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018223129212856293,
"mask/share_reasoning": 0.7735735774040222,
"mask/share_step_conf": 0.18476581573486328,
"num_tokens": 38335600.0,
"reward": 0.6690424680709839,
"reward_std": 0.22137640416622162,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7372585535049438,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.28285762667655945,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7571334838867188,
"adv/mean_abs_reasoning": 0.4741972088813782,
"adv/mean_abs_step_conf": 0.768513560295105,
"adv/ratio_final_to_reasoning": 1.5966637291534922,
"adv/ratio_step_to_reasoning": 1.6206623444874617,
"adv/std_final_conf": 0.9315160512924194,
"adv/std_reasoning": 0.7392508387565613,
"adv/std_step_conf": 0.9360662698745728,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.11328125,
"calib/ece": 0.10455999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.008,
"calib/gap": -0.026448308202871473,
"calib/mean_conf": 0.6692,
"calib/mu_c": 0.6604191616766466,
"calib/mu_w": 0.6868674698795181,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05288,
"calib/std_conf": 0.061788024729716035,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5889560439560441,
"calib/step_q_c_n": 2184.0,
"calib/step_q_gap": -0.013213305239197393,
"calib/step_q_w": 0.6021693491952415,
"calib/step_q_w_n": 1429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2440.0,
"completions/max_terminated_length": 2440.0,
"completions/mean_length": 896.26171875,
"completions/mean_terminated_length": 921.4578247070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.20551764965057373,
"kl": 0.05535888671875,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0794,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.017915409058332443,
"mask/share_reasoning": 0.778226375579834,
"mask/share_step_conf": 0.17651450634002686,
"num_tokens": 38671707.0,
"reward": 0.6882283091545105,
"reward_std": 0.2571631073951721,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7447984218597412,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.30587688088417053,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.7544215321540833,
"adv/mean_abs_reasoning": 0.33456581830978394,
"adv/mean_abs_step_conf": 0.7508994340896606,
"adv/ratio_final_to_reasoning": 2.254927105122087,
"adv/ratio_step_to_reasoning": 2.244399735403877,
"adv/std_final_conf": 0.9279298186302185,
"adv/std_reasoning": 0.6184731721878052,
"adv/std_step_conf": 0.9357430338859558,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.95703125,
"calib/ece": 0.060222656250000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": -0.0004651147143914791,
"calib/mean_conf": 0.6550898437499999,
"calib/mu_c": 0.6549281437125748,
"calib/mu_w": 0.6553932584269663,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.031484374999999995,
"calib/std_conf": 0.06971137297350115,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5895831784386617,
"calib/step_q_c_n": 2152.0,
"calib/step_q_gap": 0.0024415475373741335,
"calib/step_q_w": 0.5871416309012876,
"calib/step_q_w_n": 1165.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2123.0,
"completions/max_terminated_length": 2123.0,
"completions/mean_length": 811.8515625,
"completions/mean_terminated_length": 818.2440795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 264.0,
"epoch": 0.1376,
"grad_norm": 0.19056949019432068,
"kl": 0.05794525146484375,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.057,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020177476108074188,
"mask/share_reasoning": 0.7892353534698486,
"mask/share_step_conf": 0.1827746033668518,
"num_tokens": 38981925.0,
"reward": 0.734876275062561,
"reward_std": 0.20278804004192352,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7681304216384888,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.37115341424942017,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.7722935676574707,
"adv/mean_abs_reasoning": 0.13021612167358398,
"adv/mean_abs_step_conf": 0.7471880912780762,
"adv/ratio_final_to_reasoning": 5.930859848470978,
"adv/ratio_step_to_reasoning": 5.738061322015651,
"adv/std_final_conf": 0.9245293736457825,
"adv/std_reasoning": 0.36968645453453064,
"adv/std_step_conf": 0.9349712133407593,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.92578125,
"calib/ece": 0.16717647058823515,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03459047764849976,
"calib/mean_conf": 0.6572941176470589,
"calib/mu_c": 0.6476630434782609,
"calib/mu_w": 0.6822535211267606,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05145098039215685,
"calib/std_conf": 0.044652599826664925,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5745742205677059,
"calib/step_q_c_n": 2149.0,
"calib/step_q_gap": -0.02470164150125953,
"calib/step_q_w": 0.5992758620689654,
"calib/step_q_w_n": 1160.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2409.0,
"completions/max_terminated_length": 2409.0,
"completions/mean_length": 777.61328125,
"completions/mean_terminated_length": 783.7362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.1187925636768341,
"kl": 0.05173492431640625,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.0046,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020093917846679688,
"mask/share_reasoning": 0.7792434096336365,
"mask/share_step_conf": 0.19285017251968384,
"num_tokens": 39286282.0,
"reward": 0.7898828387260437,
"reward_std": 0.12785649299621582,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7760253548622131,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.46077150106430054,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.7504079937934875,
"adv/mean_abs_reasoning": 0.3435615301132202,
"adv/mean_abs_step_conf": 0.7561184167861938,
"adv/ratio_final_to_reasoning": 2.1842026188036585,
"adv/ratio_step_to_reasoning": 2.2008238714532915,
"adv/std_final_conf": 0.9309230446815491,
"adv/std_reasoning": 0.6185750365257263,
"adv/std_step_conf": 0.9353523850440979,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.64453125,
"calib/ece": 0.219763779527559,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.019634054325955552,
"calib/mean_conf": 0.6607086614173228,
"calib/mu_c": 0.6497321428571429,
"calib/mu_w": 0.6693661971830984,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.219763779527559,
"calib/std_conf": 0.05030898390931509,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5784447572132302,
"calib/step_q_c_n": 1421.0,
"calib/step_q_gap": -0.015160455141982165,
"calib/step_q_w": 0.5936052123552124,
"calib/step_q_w_n": 2072.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2077.0,
"completions/max_terminated_length": 2077.0,
"completions/mean_length": 814.8046875,
"completions/mean_terminated_length": 821.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.1584021896123886,
"kl": 0.0573883056640625,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0395,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0187082439661026,
"mask/share_reasoning": 0.7880691885948181,
"mask/share_step_conf": 0.1854100525379181,
"num_tokens": 39601080.0,
"reward": 0.5554240942001343,
"reward_std": 0.20415136218070984,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6875663995742798,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.13734431564807892,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.72148597240448,
"adv/mean_abs_reasoning": 0.3808108866214752,
"adv/mean_abs_step_conf": 0.7539358139038086,
"adv/ratio_final_to_reasoning": 1.8946043764805356,
"adv/ratio_step_to_reasoning": 1.9798168602601383,
"adv/std_final_conf": 0.9284760355949402,
"adv/std_reasoning": 0.6814852356910706,
"adv/std_step_conf": 0.9356449246406555,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.1640625,
"calib/ece": 0.14043137254901958,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0017073170731704002,
"calib/mean_conf": 0.6653725490196077,
"calib/mu_c": 0.6657073170731705,
"calib/mu_w": 0.6640000000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0009411764705882361,
"calib/std_conf": 0.051317527787378664,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5865436746987952,
"calib/step_q_c_n": 2656.0,
"calib/step_q_gap": -0.0016776138166110233,
"calib/step_q_w": 0.5882212885154062,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1956.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 823.26171875,
"completions/mean_terminated_length": 829.7440795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.1408,
"grad_norm": 0.22911450266838074,
"kl": 0.0549163818359375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0352,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019517943263053894,
"mask/share_reasoning": 0.7815057039260864,
"mask/share_step_conf": 0.1911638379096985,
"num_tokens": 39917427.0,
"reward": 0.798399806022644,
"reward_std": 0.22343795001506805,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8178699016571045,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.4195547103881836,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7569185495376587,
"adv/mean_abs_reasoning": 0.38095924258232117,
"adv/mean_abs_step_conf": 0.7612114548683167,
"adv/ratio_final_to_reasoning": 1.9868754053764603,
"adv/ratio_step_to_reasoning": 1.998144078900585,
"adv/std_final_conf": 0.9316318035125732,
"adv/std_reasoning": 0.6611606478691101,
"adv/std_step_conf": 0.9360270500183105,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 14.84765625,
"calib/ece": 0.09964843749999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": -0.004250904245193254,
"calib/mean_conf": 0.6838671875000001,
"calib/mu_c": 0.682156862745098,
"calib/mu_w": 0.6864077669902913,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0929296875,
"calib/std_conf": 0.057915330317972316,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5956744604316547,
"calib/step_q_c_n": 2224.0,
"calib/step_q_gap": -0.002511969498592559,
"calib/step_q_w": 0.5981864299302473,
"calib/step_q_w_n": 1577.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 996.5234375,
"completions/mean_terminated_length": 1004.3700561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.1763768196105957,
"kl": 0.04608917236328125,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0136,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.015577636659145355,
"mask/share_reasoning": 0.8027868270874023,
"mask/share_step_conf": 0.17382307350635529,
"num_tokens": 40278881.0,
"reward": 0.6586554050445557,
"reward_std": 0.2522858679294586,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7467058897018433,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.2510736882686615,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7732973098754883,
"adv/mean_abs_reasoning": 0.3910679221153259,
"adv/mean_abs_step_conf": 0.7673285007476807,
"adv/ratio_final_to_reasoning": 1.977398978910479,
"adv/ratio_step_to_reasoning": 1.9621361337875098,
"adv/std_final_conf": 0.9308413863182068,
"adv/std_reasoning": 0.6612193584442139,
"adv/std_step_conf": 0.9361827969551086,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 15.3203125,
"calib/ece": 0.08171874999999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.009156626506024002,
"calib/mean_conf": 0.6840625,
"calib/mu_c": 0.680843373493976,
"calib/mu_w": 0.69,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05867187499999997,
"calib/std_conf": 0.060135751377612304,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.599133252328878,
"calib/step_q_c_n": 2469.0,
"calib/step_q_gap": 0.0017210018127046256,
"calib/step_q_w": 0.5974122505161734,
"calib/step_q_w_n": 1453.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2305.0,
"completions/max_terminated_length": 2305.0,
"completions/mean_length": 985.46875,
"completions/mean_terminated_length": 993.2283325195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.21439006924629211,
"kl": 0.04900360107421875,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.009,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.016259362921118736,
"mask/share_reasoning": 0.796444833278656,
"mask/share_step_conf": 0.1794833391904831,
"num_tokens": 40640113.0,
"reward": 0.6827579140663147,
"reward_std": 0.2516036629676819,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.762973427772522,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.27285486459732056,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7396765351295471,
"adv/mean_abs_reasoning": 0.3181406259536743,
"adv/mean_abs_step_conf": 0.7739882469177246,
"adv/ratio_final_to_reasoning": 2.324998679160373,
"adv/ratio_step_to_reasoning": 2.4328494501372737,
"adv/std_final_conf": 0.929745614528656,
"adv/std_reasoning": 0.6185531616210938,
"adv/std_step_conf": 0.935878574848175,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.94921875,
"calib/ece": 0.07687747035573127,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.000853842669485072,
"calib/mean_conf": 0.6714229249011858,
"calib/mu_c": 0.6711731843575419,
"calib/mu_w": 0.672027027027027,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.020395256916996046,
"calib/std_conf": 0.054202363724677044,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5869275603663614,
"calib/step_q_c_n": 2402.0,
"calib/step_q_gap": -0.011207597888557408,
"calib/step_q_w": 0.5981351582549188,
"calib/step_q_w_n": 1169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 904.71484375,
"completions/mean_terminated_length": 915.4427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.144,
"grad_norm": 0.149041548371315,
"kl": 0.05022430419921875,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0673,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.017318157479166985,
"mask/share_reasoning": 0.7906175851821899,
"mask/share_step_conf": 0.1803455352783203,
"num_tokens": 40977600.0,
"reward": 0.7551214694976807,
"reward_std": 0.20652639865875244,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7792269587516785,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3935159742832184,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.7377204895019531,
"adv/mean_abs_reasoning": 0.4371660351753235,
"adv/mean_abs_step_conf": 0.7849434614181519,
"adv/ratio_final_to_reasoning": 1.6875064166549298,
"adv/ratio_step_to_reasoning": 1.7955270955653126,
"adv/std_final_conf": 0.9307897090911865,
"adv/std_reasoning": 0.7205690145492554,
"adv/std_step_conf": 0.9362493753433228,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.6484375,
"calib/ece": 0.12819999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02222403764459846,
"calib/mean_conf": 0.6714,
"calib/mu_c": 0.6618881118881118,
"calib/mu_w": 0.6841121495327103,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11379999999999994,
"calib/std_conf": 0.051660816873138976,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5871563483735571,
"calib/step_q_c_n": 1906.0,
"calib/step_q_gap": -0.01733389023815657,
"calib/step_q_w": 0.6044902386117137,
"calib/step_q_w_n": 1844.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2221.0,
"completions/max_terminated_length": 2221.0,
"completions/mean_length": 855.8671875,
"completions/mean_terminated_length": 876.4080200195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.21199779212474823,
"kl": 0.05721282958984375,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.1456,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018238719552755356,
"mask/share_reasoning": 0.7721179723739624,
"mask/share_step_conf": 0.18620575964450836,
"num_tokens": 41305190.0,
"reward": 0.6414843201637268,
"reward_std": 0.22870557010173798,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7146027088165283,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.26133468747138977,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.7521461248397827,
"adv/mean_abs_reasoning": 0.2468286156654358,
"adv/mean_abs_step_conf": 0.7700768709182739,
"adv/ratio_final_to_reasoning": 3.047240380990834,
"adv/ratio_step_to_reasoning": 3.11988490006392,
"adv/std_final_conf": 0.9292258620262146,
"adv/std_reasoning": 0.5483195185661316,
"adv/std_step_conf": 0.9359169602394104,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 15.21484375,
"calib/ece": 0.07782608695652168,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": 0.006240981240981269,
"calib/mean_conf": 0.6781422924901186,
"calib/mu_c": 0.6805844155844156,
"calib/mu_w": 0.6743434343434344,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07363636363636357,
"calib/std_conf": 0.05971695019170884,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.597585152838428,
"calib/step_q_c_n": 2290.0,
"calib/step_q_gap": -0.00749584404630721,
"calib/step_q_w": 0.6050809968847352,
"calib/step_q_w_n": 1605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2522.0,
"completions/max_terminated_length": 2522.0,
"completions/mean_length": 930.125,
"completions/mean_terminated_length": 937.4487915039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.3232787549495697,
"kl": 0.0700531005859375,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0455,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.016633665189146996,
"mask/share_reasoning": 0.793748676776886,
"mask/share_step_conf": 0.18180516362190247,
"num_tokens": 41650286.0,
"reward": 0.7173147201538086,
"reward_std": 0.18486090004444122,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7475347518920898,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3691259026527405,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7692033052444458,
"adv/mean_abs_reasoning": 0.26124483346939087,
"adv/mean_abs_step_conf": 0.7495217323303223,
"adv/ratio_final_to_reasoning": 2.944377100320993,
"adv/ratio_step_to_reasoning": 2.8690394461643622,
"adv/std_final_conf": 0.9280076622962952,
"adv/std_reasoning": 0.5482669472694397,
"adv/std_step_conf": 0.935155987739563,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.046875,
"calib/ece": 0.05815686274509814,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.023529411764705882,
"calib/gap": 0.019387522768670284,
"calib/mean_conf": 0.6696078431372549,
"calib/mu_c": 0.6750819672131148,
"calib/mu_w": 0.6556944444444445,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.005058823529411749,
"calib/std_conf": 0.07222407355743857,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6015701515740381,
"calib/step_q_c_n": 2573.0,
"calib/step_q_gap": 0.0005046579278993324,
"calib/step_q_w": 0.6010654936461388,
"calib/step_q_w_n": 1023.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2919.0,
"completions/max_terminated_length": 2919.0,
"completions/mean_length": 862.86328125,
"completions/mean_terminated_length": 866.2471313476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.1472,
"grad_norm": 0.157817080616951,
"kl": 0.058624267578125,
"learning_rate": 1.75e-06,
"loss": 0.0231,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019831445068120956,
"mask/share_reasoning": 0.7864515781402588,
"mask/share_step_conf": 0.18981072306632996,
"num_tokens": 41975515.0,
"reward": 0.7384357452392578,
"reward_std": 0.19949795305728912,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7945871353149414,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.34009692072868347,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.7381882071495056,
"adv/mean_abs_reasoning": 0.3053765892982483,
"adv/mean_abs_step_conf": 0.7476175427436829,
"adv/ratio_final_to_reasoning": 2.417304511933456,
"adv/ratio_step_to_reasoning": 2.448182241021484,
"adv/std_final_conf": 0.9278108477592468,
"adv/std_reasoning": 0.5961070656776428,
"adv/std_step_conf": 0.9348329901695251,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.234375,
"calib/ece": 0.10700787401574796,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007726075504828844,
"calib/mean_conf": 0.6596850393700787,
"calib/mu_c": 0.6576470588235295,
"calib/mu_w": 0.6653731343283583,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.015236220472440934,
"calib/std_conf": 0.04211788869154169,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.583914309484193,
"calib/step_q_c_n": 2404.0,
"calib/step_q_gap": -0.00866699132881521,
"calib/step_q_w": 0.5925813008130082,
"calib/step_q_w_n": 984.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2539.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 805.84765625,
"completions/mean_terminated_length": 812.1929321289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.19154296815395355,
"kl": 0.05483245849609375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0483,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019182808697223663,
"mask/share_reasoning": 0.7837511301040649,
"mask/share_step_conf": 0.18925350904464722,
"num_tokens": 42284908.0,
"reward": 0.7721434831619263,
"reward_std": 0.18229490518569946,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7868027687072754,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.4137341380119324,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.7774754166603088,
"adv/mean_abs_reasoning": 0.32294416427612305,
"adv/mean_abs_step_conf": 0.7844225168228149,
"adv/ratio_final_to_reasoning": 2.407460801786012,
"adv/ratio_step_to_reasoning": 2.428972570478529,
"adv/std_final_conf": 0.9279150366783142,
"adv/std_reasoning": 0.5960806608200073,
"adv/std_step_conf": 0.9359557628631592,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.31640625,
"calib/ece": 0.12937254901960785,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02518849206349194,
"calib/mean_conf": 0.6697647058823528,
"calib/mu_c": 0.6635416666666667,
"calib/mu_w": 0.6887301587301586,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02309803921568629,
"calib/std_conf": 0.0580596678326277,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5849558638083229,
"calib/step_q_c_n": 2379.0,
"calib/step_q_gap": -0.017180058521774177,
"calib/step_q_w": 0.602135922330097,
"calib/step_q_w_n": 1030.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1960.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 872.99609375,
"completions/mean_terminated_length": 879.8700561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.16991457343101501,
"kl": 0.0538330078125,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0077,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01823488250374794,
"mask/share_reasoning": 0.790435791015625,
"mask/share_step_conf": 0.18351686000823975,
"num_tokens": 42613411.0,
"reward": 0.7748196721076965,
"reward_std": 0.20988968014717102,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7912160158157349,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.4092045724391937,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.7478249073028564,
"adv/mean_abs_reasoning": 0.30096516013145447,
"adv/mean_abs_step_conf": 0.7793318033218384,
"adv/ratio_final_to_reasoning": 2.4847557337740493,
"adv/ratio_step_to_reasoning": 2.5894419240467723,
"adv/std_final_conf": 0.9278432726860046,
"adv/std_reasoning": 0.5961489677429199,
"adv/std_step_conf": 0.9352448582649231,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 15.125,
"calib/ece": 0.10784000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.008,
"calib/gap": -0.00848232848232855,
"calib/mean_conf": 0.6748,
"calib/mu_c": 0.6725945945945946,
"calib/mu_w": 0.6810769230769231,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02132000000000002,
"calib/std_conf": 0.06133970981346424,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5923554356206631,
"calib/step_q_c_n": 2594.0,
"calib/step_q_gap": -0.02778540944975949,
"calib/step_q_w": 0.6201408450704226,
"calib/step_q_w_n": 1278.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2989.0,
"completions/max_terminated_length": 2989.0,
"completions/mean_length": 946.01953125,
"completions/mean_terminated_length": 964.8645629882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.1504,
"grad_norm": 0.1529771089553833,
"kl": 0.04474639892578125,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0675,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.016732092946767807,
"mask/share_reasoning": 0.7889929413795471,
"mask/share_step_conf": 0.17474375665187836,
"num_tokens": 42962688.0,
"reward": 0.7419514656066895,
"reward_std": 0.19250836968421936,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7776585817337036,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.3664005696773529,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.7602142095565796,
"adv/mean_abs_reasoning": 0.3405728042125702,
"adv/mean_abs_step_conf": 0.7700763940811157,
"adv/ratio_final_to_reasoning": 2.2321635789864422,
"adv/ratio_step_to_reasoning": 2.261121218594039,
"adv/std_final_conf": 0.9295332431793213,
"adv/std_reasoning": 0.6185359358787537,
"adv/std_step_conf": 0.936034083366394,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.9453125,
"calib/ece": 0.10357142857142856,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.011647058823529344,
"calib/mean_conf": 0.6673809523809524,
"calib/mu_c": 0.6626666666666666,
"calib/mu_w": 0.674313725490196,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08785714285714286,
"calib/std_conf": 0.05888081277783693,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5899589953869809,
"calib/step_q_c_n": 1951.0,
"calib/step_q_gap": -0.013444340005236421,
"calib/step_q_w": 0.6034033353922174,
"calib/step_q_w_n": 1619.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 902.99609375,
"completions/mean_terminated_length": 920.9840698242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.1717846393585205,
"kl": 0.04624176025390625,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0152,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01820656657218933,
"mask/share_reasoning": 0.7857927680015564,
"mask/share_step_conf": 0.17646938562393188,
"num_tokens": 43299015.0,
"reward": 0.6462733745574951,
"reward_std": 0.21243321895599365,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7331492304801941,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.24533499777317047,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.75215083360672,
"adv/mean_abs_reasoning": 0.325761616230011,
"adv/mean_abs_step_conf": 0.7678713202476501,
"adv/ratio_final_to_reasoning": 2.3088995023761414,
"adv/ratio_step_to_reasoning": 2.3571571418821735,
"adv/std_final_conf": 0.9295081496238708,
"adv/std_reasoning": 0.618578314781189,
"adv/std_step_conf": 0.935473620891571,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.97265625,
"calib/ece": 0.16708661417322834,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.03146934988938843,
"calib/mean_conf": 0.6662204724409448,
"calib/mu_c": 0.6561849710982659,
"calib/mu_w": 0.6876543209876543,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07610236220472441,
"calib/std_conf": 0.054439580521306874,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5799955015744489,
"calib/step_q_c_n": 2223.0,
"calib/step_q_gap": -0.026939505811075515,
"calib/step_q_w": 0.6069350073855244,
"calib/step_q_w_n": 1354.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2217.0,
"completions/max_terminated_length": 2217.0,
"completions/mean_length": 894.27734375,
"completions/mean_terminated_length": 904.8814697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 354.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.4571000337600708,
"kl": 0.05872344970703125,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0381,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.017354609444737434,
"mask/share_reasoning": 0.795860767364502,
"mask/share_step_conf": 0.17506583034992218,
"num_tokens": 43635286.0,
"reward": 0.7234030365943909,
"reward_std": 0.22833162546157837,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7575539350509644,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.35643959045410156,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.7369581460952759,
"adv/mean_abs_reasoning": 0.28275996446609497,
"adv/mean_abs_step_conf": 0.71214759349823,
"adv/ratio_final_to_reasoning": 2.6063030085846637,
"adv/ratio_step_to_reasoning": 2.518558788345094,
"adv/std_final_conf": 0.9275577664375305,
"adv/std_reasoning": 0.5959152579307556,
"adv/std_step_conf": 0.9351029396057129,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.6796875,
"calib/ece": 0.14607843137254897,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.006871091871091584,
"calib/mean_conf": 0.658392156862745,
"calib/mu_c": 0.6566137566137568,
"calib/mu_w": 0.6634848484848483,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0316470588235294,
"calib/std_conf": 0.05160334326886597,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5831010752688173,
"calib/step_q_c_n": 2325.0,
"calib/step_q_gap": -0.006204027879934126,
"calib/step_q_w": 0.5893051031487514,
"calib/step_q_w_n": 921.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 842.84375,
"completions/mean_terminated_length": 849.4802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.1536,
"grad_norm": 0.17217902839183807,
"kl": 0.05643463134765625,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0481,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019316058605909348,
"mask/share_reasoning": 0.7909666299819946,
"mask/share_step_conf": 0.18190476298332214,
"num_tokens": 43955182.0,
"reward": 0.7673730850219727,
"reward_std": 0.15579378604888916,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7929043173789978,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3949667513370514,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.719872236251831,
"adv/mean_abs_reasoning": 0.396770179271698,
"adv/mean_abs_step_conf": 0.736086368560791,
"adv/ratio_final_to_reasoning": 1.8143304962414555,
"adv/ratio_step_to_reasoning": 1.855195795994381,
"adv/std_final_conf": 0.9305780529975891,
"adv/std_reasoning": 0.7011988162994385,
"adv/std_step_conf": 0.9361504316329956,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.390625,
"calib/ece": 0.07204724409448815,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.011173909940769278,
"calib/mean_conf": 0.674488188976378,
"calib/mu_c": 0.6709248554913294,
"calib/mu_w": 0.6820987654320987,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03271653543307088,
"calib/std_conf": 0.06198435999606487,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5919049604001667,
"calib/step_q_c_n": 2399.0,
"calib/step_q_gap": -0.01571371664263488,
"calib/step_q_w": 0.6076186770428016,
"calib/step_q_w_n": 1285.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1957.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 879.0859375,
"completions/mean_terminated_length": 889.5098876953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.17100566625595093,
"kl": 0.05695343017578125,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0269,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.018207937479019165,
"mask/share_reasoning": 0.7837401032447815,
"mask/share_step_conf": 0.18633320927619934,
"num_tokens": 44282932.0,
"reward": 0.6938064098358154,
"reward_std": 0.24391242861747742,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7680108547210693,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.28600814938545227,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7830526828765869,
"adv/mean_abs_reasoning": 0.31210857629776,
"adv/mean_abs_step_conf": 0.7571125626564026,
"adv/ratio_final_to_reasoning": 2.50891113651915,
"adv/ratio_step_to_reasoning": 2.4257986487820724,
"adv/std_final_conf": 0.9304502606391907,
"adv/std_reasoning": 0.5727735757827759,
"adv/std_step_conf": 0.9361012578010559,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.39453125,
"calib/ece": 0.08645669291338594,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.009376797698945394,
"calib/mean_conf": 0.6730708661417322,
"calib/mu_c": 0.6691946308724832,
"calib/mu_w": 0.6785714285714286,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08645669291338594,
"calib/std_conf": 0.06035996758325317,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5853272532188841,
"calib/step_q_c_n": 1864.0,
"calib/step_q_gap": -0.01265357745204243,
"calib/step_q_w": 0.5979808306709266,
"calib/step_q_w_n": 1565.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2374.0,
"completions/max_terminated_length": 2374.0,
"completions/mean_length": 899.69921875,
"completions/mean_terminated_length": 910.3676147460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.15870602428913116,
"kl": 0.05303192138671875,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0074,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01740969344973564,
"mask/share_reasoning": 0.7895663380622864,
"mask/share_step_conf": 0.1813051998615265,
"num_tokens": 44620471.0,
"reward": 0.6407595872879028,
"reward_std": 0.2192278802394867,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7360406517982483,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.230634868144989,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.7577475309371948,
"adv/mean_abs_reasoning": 0.26077109575271606,
"adv/mean_abs_step_conf": 0.7750769257545471,
"adv/ratio_final_to_reasoning": 2.905795708492752,
"adv/ratio_step_to_reasoning": 2.972250139599585,
"adv/std_final_conf": 0.9285784959793091,
"adv/std_reasoning": 0.548322319984436,
"adv/std_step_conf": 0.9357059001922607,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.48828125,
"calib/ece": 0.06952000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.004,
"calib/gap": -0.014648556876061036,
"calib/mean_conf": 0.67376,
"calib/mu_c": 0.6681935483870968,
"calib/mu_w": 0.6828421052631578,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06164000000000003,
"calib/std_conf": 0.05776731255649686,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5893780369290573,
"calib/step_q_c_n": 2058.0,
"calib/step_q_gap": -0.020761272580330892,
"calib/step_q_w": 0.6101393095093882,
"calib/step_q_w_n": 1651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2207.0,
"completions/max_terminated_length": 2207.0,
"completions/mean_length": 899.05078125,
"completions/mean_terminated_length": 916.960205078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.1568,
"grad_norm": 0.15729469060897827,
"kl": 0.0526580810546875,
"learning_rate": 1.5e-06,
"loss": -0.0615,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.017730841413140297,
"mask/share_reasoning": 0.7814549207687378,
"mask/share_step_conf": 0.18128295242786407,
"num_tokens": 44954308.0,
"reward": 0.6083645820617676,
"reward_std": 0.19842839241027832,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.733662486076355,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.1666603982448578,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.7911767363548279,
"adv/mean_abs_reasoning": 0.23125512897968292,
"adv/mean_abs_step_conf": 0.7521365880966187,
"adv/ratio_final_to_reasoning": 3.4212289251510493,
"adv/ratio_step_to_reasoning": 3.252410406701501,
"adv/std_final_conf": 0.9272419810295105,
"adv/std_reasoning": 0.49597135186195374,
"adv/std_step_conf": 0.9352139234542847,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.7734375,
"calib/ece": 0.1997265625000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03117959949937421,
"calib/mean_conf": 0.6663671875,
"calib/mu_c": 0.6580851063829787,
"calib/mu_w": 0.6892647058823529,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06585937500000005,
"calib/std_conf": 0.058340376655793405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5856148208469055,
"calib/step_q_c_n": 2456.0,
"calib/step_q_gap": -0.012441253919449613,
"calib/step_q_w": 0.5980560747663551,
"calib/step_q_w_n": 1070.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1897.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 830.12890625,
"completions/mean_terminated_length": 836.6653442382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.1504538506269455,
"kl": 0.0512542724609375,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0055,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02039681002497673,
"mask/share_reasoning": 0.7797145247459412,
"mask/share_step_conf": 0.192076176404953,
"num_tokens": 45271933.0,
"reward": 0.7831298112869263,
"reward_std": 0.1646493375301361,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7847386598587036,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.43464604020118713,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.7535303831100464,
"adv/mean_abs_reasoning": 0.2745121121406555,
"adv/mean_abs_step_conf": 0.7678822875022888,
"adv/ratio_final_to_reasoning": 2.7449804572701324,
"adv/ratio_step_to_reasoning": 2.797261955089393,
"adv/std_final_conf": 0.9277353882789612,
"adv/std_reasoning": 0.5726152062416077,
"adv/std_step_conf": 0.9349270462989807,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.31640625,
"calib/ece": 0.09976377952755902,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.023622047244094488,
"calib/gap": -0.027439775910364173,
"calib/mean_conf": 0.674015748031496,
"calib/mu_c": 0.6649411764705881,
"calib/mu_w": 0.6923809523809523,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05224409448818898,
"calib/std_conf": 0.07041787356455628,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5885290208241029,
"calib/step_q_c_n": 2257.0,
"calib/step_q_gap": -0.023530638266806236,
"calib/step_q_w": 0.6120596590909091,
"calib/step_q_w_n": 1408.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2873.0,
"completions/max_terminated_length": 2873.0,
"completions/mean_length": 945.51171875,
"completions/mean_terminated_length": 952.9566650390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.1864592432975769,
"kl": 0.0453643798828125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0194,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.017488349229097366,
"mask/share_reasoning": 0.7987071871757507,
"mask/share_step_conf": 0.175991952419281,
"num_tokens": 45618440.0,
"reward": 0.7075504660606384,
"reward_std": 0.17754337191581726,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7555820345878601,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.3282688856124878,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.7613473534584045,
"adv/mean_abs_reasoning": 0.17415973544120789,
"adv/mean_abs_step_conf": 0.7704967260360718,
"adv/ratio_final_to_reasoning": 4.371546336641152,
"adv/ratio_step_to_reasoning": 4.424080710068447,
"adv/std_final_conf": 0.924812912940979,
"adv/std_reasoning": 0.4675443768501282,
"adv/std_step_conf": 0.9354718327522278,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.4375,
"calib/ece": 0.09329411764705892,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": -0.01556890103567321,
"calib/mean_conf": 0.6520392156862744,
"calib/mu_c": 0.6472159090909091,
"calib/mu_w": 0.6627848101265823,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.027568627450980397,
"calib/std_conf": 0.05854241324377105,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5801556603773586,
"calib/step_q_c_n": 2120.0,
"calib/step_q_gap": -0.010671407291814372,
"calib/step_q_w": 0.5908270676691729,
"calib/step_q_w_n": 1064.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 804.04296875,
"completions/mean_terminated_length": 810.3740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.16,
"grad_norm": 0.11460676789283752,
"kl": 0.055023193359375,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0341,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02103450521826744,
"mask/share_reasoning": 0.7882153987884521,
"mask/share_step_conf": 0.1829375922679901,
"num_tokens": 45929235.0,
"reward": 0.7221521735191345,
"reward_std": 0.14214414358139038,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7698402404785156,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.33852654695510864,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.768987774848938,
"adv/mean_abs_reasoning": 0.28910064697265625,
"adv/mean_abs_step_conf": 0.7840760946273804,
"adv/ratio_final_to_reasoning": 2.6599310063864037,
"adv/ratio_step_to_reasoning": 2.7121215494946296,
"adv/std_final_conf": 0.9297264218330383,
"adv/std_reasoning": 0.5726152062416077,
"adv/std_step_conf": 0.9357607960700989,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.69140625,
"calib/ece": 0.0995256916996047,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007418688693856423,
"calib/mean_conf": 0.6748616600790515,
"calib/mu_c": 0.6718120805369127,
"calib/mu_w": 0.6792307692307691,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09272727272727267,
"calib/std_conf": 0.06690563699177939,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5945520934761441,
"calib/step_q_c_n": 2054.0,
"calib/step_q_gap": -0.00992945309679083,
"calib/step_q_w": 0.604481546572935,
"calib/step_q_w_n": 1707.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2756.0,
"completions/max_terminated_length": 2756.0,
"completions/mean_length": 924.71875,
"completions/mean_terminated_length": 939.3968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.19589243829250336,
"kl": 0.0519866943359375,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0981,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.017413543537259102,
"mask/share_reasoning": 0.7887399196624756,
"mask/share_step_conf": 0.17822150886058807,
"num_tokens": 46272987.0,
"reward": 0.6662525534629822,
"reward_std": 0.18551214039325714,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7337561845779419,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.2846863567829132,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7441304922103882,
"adv/mean_abs_reasoning": 0.45768579840660095,
"adv/mean_abs_step_conf": 0.7441684007644653,
"adv/ratio_final_to_reasoning": 1.6258544503697145,
"adv/ratio_step_to_reasoning": 1.6259372769599412,
"adv/std_final_conf": 0.9302271008491516,
"adv/std_reasoning": 0.7392387986183167,
"adv/std_step_conf": 0.9360255599021912,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.0703125,
"calib/ece": 0.12772908366533864,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.00796812749003984,
"calib/gap": -0.03468060052987931,
"calib/mean_conf": 0.6623107569721116,
"calib/mu_c": 0.6513953488372093,
"calib/mu_w": 0.6860759493670886,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05239043824701195,
"calib/std_conf": 0.06678322429797894,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5809782082324455,
"calib/step_q_c_n": 2065.0,
"calib/step_q_gap": -0.02828799005014626,
"calib/step_q_w": 0.6092661982825918,
"calib/step_q_w_n": 1281.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2708.0,
"completions/max_terminated_length": 2708.0,
"completions/mean_length": 864.37890625,
"completions/mean_terminated_length": 878.0992431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 361.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.21392974257469177,
"kl": 0.05400848388671875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0955,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019230522215366364,
"mask/share_reasoning": 0.7847087383270264,
"mask/share_step_conf": 0.18043574690818787,
"num_tokens": 46599660.0,
"reward": 0.7458244562149048,
"reward_std": 0.2385040670633316,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7494453191757202,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.41173478960990906,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7392740249633789,
"adv/mean_abs_reasoning": 0.22879235446453094,
"adv/mean_abs_step_conf": 0.7788934707641602,
"adv/ratio_final_to_reasoning": 3.2312007396120683,
"adv/ratio_step_to_reasoning": 3.4043684396145757,
"adv/std_final_conf": 0.9284136891365051,
"adv/std_reasoning": 0.5227508544921875,
"adv/std_step_conf": 0.9354473948478699,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.27734375,
"calib/ece": 0.10626984126984128,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.007936507936507936,
"calib/gap": -0.031333333333333435,
"calib/mean_conf": 0.6703968253968254,
"calib/mu_c": 0.6614444444444444,
"calib/mu_w": 0.6927777777777778,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03119047619047619,
"calib/std_conf": 0.05615879969846835,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.582473164448261,
"calib/step_q_c_n": 2329.0,
"calib/step_q_gap": -0.038959716396384514,
"calib/step_q_w": 0.6214328808446455,
"calib/step_q_w_n": 1326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 937.6171875,
"completions/mean_terminated_length": 956.2948608398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.1632,
"grad_norm": 0.12315616756677628,
"kl": 0.0469512939453125,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0565,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.016671668738126755,
"mask/share_reasoning": 0.7928116321563721,
"mask/share_step_conf": 0.17098543047904968,
"num_tokens": 46947010.0,
"reward": 0.7480127811431885,
"reward_std": 0.17629170417785645,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7658921480178833,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3926334083080292,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.7574979066848755,
"adv/mean_abs_reasoning": 0.3248511850833893,
"adv/mean_abs_step_conf": 0.7860252857208252,
"adv/ratio_final_to_reasoning": 2.3318305164576385,
"adv/ratio_step_to_reasoning": 2.419647277934518,
"adv/std_final_conf": 0.9292215704917908,
"adv/std_reasoning": 0.6185224056243896,
"adv/std_step_conf": 0.9357133507728577,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.3125,
"calib/ece": 0.0781746031746032,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.00029931972789110084,
"calib/mean_conf": 0.6615079365079365,
"calib/mu_c": 0.6616326530612244,
"calib/mu_w": 0.6613333333333333,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0781746031746032,
"calib/std_conf": 0.042380878069015165,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5797038233710287,
"calib/step_q_c_n": 1857.0,
"calib/step_q_gap": -0.019696563476166773,
"calib/step_q_w": 0.5994003868471954,
"calib/step_q_w_n": 1551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2044.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 851.70703125,
"completions/mean_terminated_length": 865.2262573242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.16613580286502838,
"kl": 0.05309295654296875,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0603,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.017889706417918205,
"mask/share_reasoning": 0.7869839668273926,
"mask/share_step_conf": 0.17950135469436646,
"num_tokens": 47269487.0,
"reward": 0.633529007434845,
"reward_std": 0.22433821856975555,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7374765276908875,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.21786263585090637,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.7702876925468445,
"adv/mean_abs_reasoning": 0.42663636803627014,
"adv/mean_abs_step_conf": 0.7740648984909058,
"adv/ratio_final_to_reasoning": 1.8054899916112146,
"adv/ratio_step_to_reasoning": 1.8143434467478385,
"adv/std_final_conf": 0.9317903518676758,
"adv/std_reasoning": 0.7012701034545898,
"adv/std_step_conf": 0.9358920454978943,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.796875,
"calib/ece": 0.08015624999999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.014206773618538282,
"calib/mean_conf": 0.664296875,
"calib/mu_c": 0.6586363636363637,
"calib/mu_w": 0.672843137254902,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07144531249999997,
"calib/std_conf": 0.054085921136968494,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5821870286576171,
"calib/step_q_c_n": 1989.0,
"calib/step_q_gap": -0.010321072444132828,
"calib/step_q_w": 0.5925081011017499,
"calib/step_q_w_n": 1543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1765.0,
"completions/max_terminated_length": 1765.0,
"completions/mean_length": 835.6328125,
"completions/mean_terminated_length": 842.2125854492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 352.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.20822681486606598,
"kl": 0.0546417236328125,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0081,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018709741532802582,
"mask/share_reasoning": 0.788835883140564,
"mask/share_step_conf": 0.18464188277721405,
"num_tokens": 47590625.0,
"reward": 0.6713146567344666,
"reward_std": 0.2506943941116333,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7466437816619873,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.2756730318069458,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.7726517915725708,
"adv/mean_abs_reasoning": 0.24603010714054108,
"adv/mean_abs_step_conf": 0.7691492438316345,
"adv/ratio_final_to_reasoning": 3.140476588628256,
"adv/ratio_step_to_reasoning": 3.1262403320105427,
"adv/std_final_conf": 0.9284862875938416,
"adv/std_reasoning": 0.5227888226509094,
"adv/std_step_conf": 0.9357995390892029,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.28515625,
"calib/ece": 0.10900398406374504,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.021861842105263096,
"calib/mean_conf": 0.6622310756972111,
"calib/mu_c": 0.6552631578947369,
"calib/mu_w": 0.677125,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0449800796812749,
"calib/std_conf": 0.05450065234484988,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804718779790278,
"calib/step_q_c_n": 2098.0,
"calib/step_q_gap": -0.017632496541309828,
"calib/step_q_w": 0.5981043745203376,
"calib/step_q_w_n": 1303.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2542.0,
"completions/max_terminated_length": 2542.0,
"completions/mean_length": 847.90625,
"completions/mean_terminated_length": 868.2560424804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.1664,
"grad_norm": 0.18850506842136383,
"kl": 0.05255126953125,
"learning_rate": 1.25e-06,
"loss": -0.0312,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018288226798176765,
"mask/share_reasoning": 0.7817035913467407,
"mask/share_step_conf": 0.17657069861888885,
"num_tokens": 47912449.0,
"reward": 0.717280387878418,
"reward_std": 0.18664982914924622,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7549937963485718,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3498794436454773,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.765222430229187,
"adv/mean_abs_reasoning": 0.3630412220954895,
"adv/mean_abs_step_conf": 0.7838761210441589,
"adv/ratio_final_to_reasoning": 2.1078114099888996,
"adv/ratio_step_to_reasoning": 2.1591931531069455,
"adv/std_final_conf": 0.928774356842041,
"adv/std_reasoning": 0.6402490735054016,
"adv/std_step_conf": 0.9357649683952332,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.8828125,
"calib/ece": 0.06405511811023627,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.007035714285714478,
"calib/mean_conf": 0.6668110236220472,
"calib/mu_c": 0.6687500000000001,
"calib/mu_w": 0.6617142857142856,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0032283464566929135,
"calib/std_conf": 0.0634666794943521,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5885262689225289,
"calib/step_q_c_n": 2246.0,
"calib/step_q_gap": -0.01572278050712894,
"calib/step_q_w": 0.6042490494296578,
"calib/step_q_w_n": 1052.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1876.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 837.375,
"completions/mean_terminated_length": 847.3043823242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.200291246175766,
"kl": 0.05327606201171875,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0404,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0200999416410923,
"mask/share_reasoning": 0.7814955711364746,
"mask/share_step_conf": 0.18668577075004578,
"num_tokens": 48230545.0,
"reward": 0.7479537129402161,
"reward_std": 0.22217577695846558,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7896058559417725,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.36411404609680176,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.7526696920394897,
"adv/mean_abs_reasoning": 0.3639039099216461,
"adv/mean_abs_step_conf": 0.7706518769264221,
"adv/ratio_final_to_reasoning": 2.0683198820302606,
"adv/ratio_step_to_reasoning": 2.117734533526597,
"adv/std_final_conf": 0.9300745129585266,
"adv/std_reasoning": 0.6402790546417236,
"adv/std_step_conf": 0.9360171556472778,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.05859375,
"calib/ece": 0.18531496062992123,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.028512839059674278,
"calib/mean_conf": 0.6668110236220472,
"calib/mu_c": 0.6579428571428573,
"calib/mu_w": 0.6864556962025316,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0815748031496063,
"calib/std_conf": 0.06711480729909547,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804814636494945,
"calib/step_q_c_n": 2077.0,
"calib/step_q_gap": -0.02657224883075826,
"calib/step_q_w": 0.6070537124802527,
"calib/step_q_w_n": 1266.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 878.85546875,
"completions/mean_terminated_length": 885.7755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 271.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.1852419376373291,
"kl": 0.05022430419921875,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0119,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019303487613797188,
"mask/share_reasoning": 0.789633572101593,
"mask/share_step_conf": 0.18325044214725494,
"num_tokens": 48560772.0,
"reward": 0.6829037666320801,
"reward_std": 0.21487480401992798,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7624925374984741,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.26815879344940186,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.7331850528717041,
"adv/mean_abs_reasoning": 0.38067251443862915,
"adv/mean_abs_step_conf": 0.7472409009933472,
"adv/ratio_final_to_reasoning": 1.9260257178087017,
"adv/ratio_step_to_reasoning": 1.9629494451294698,
"adv/std_final_conf": 0.9297038912773132,
"adv/std_reasoning": 0.6815901398658752,
"adv/std_step_conf": 0.9354227185249329,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.9921875,
"calib/ece": 0.14879999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007530248033877807,
"calib/mean_conf": 0.65568,
"calib/mu_c": 0.6533908045977012,
"calib/mu_w": 0.660921052631579,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05423999999999999,
"calib/std_conf": 0.05591902717322612,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5796999531176746,
"calib/step_q_c_n": 2133.0,
"calib/step_q_gap": -0.024432486111160223,
"calib/step_q_w": 0.6041324392288349,
"calib/step_q_w_n": 1193.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2318.0,
"completions/max_terminated_length": 2318.0,
"completions/mean_length": 792.640625,
"completions/mean_terminated_length": 811.6640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 374.0,
"epoch": 0.1696,
"grad_norm": 0.22308699786663055,
"kl": 0.0548553466796875,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0441,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01981358602643013,
"mask/share_reasoning": 0.7754086256027222,
"mask/share_step_conf": 0.1813403069972992,
"num_tokens": 48868472.0,
"reward": 0.7687622308731079,
"reward_std": 0.2001192569732666,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7621843814849854,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.4440900683403015,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.762536883354187,
"adv/mean_abs_reasoning": 0.4082333445549011,
"adv/mean_abs_step_conf": 0.7621127367019653,
"adv/ratio_final_to_reasoning": 1.86789465761447,
"adv/ratio_step_to_reasoning": 1.8668556767034812,
"adv/std_final_conf": 0.9294963479042053,
"adv/std_reasoning": 0.68171626329422,
"adv/std_step_conf": 0.9357032775878906,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 13.76171875,
"calib/ece": 0.08971774193548387,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.020982956215104265,
"calib/mean_conf": 0.6575403225806451,
"calib/mu_c": 0.6506024096385542,
"calib/mu_w": 0.6715853658536585,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03895161290322579,
"calib/std_conf": 0.0517515621072553,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.579351145038168,
"calib/step_q_c_n": 2096.0,
"calib/step_q_gap": -0.02983596077823003,
"calib/step_q_w": 0.609187105816398,
"calib/step_q_w_n": 1427.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 804.671875,
"completions/mean_terminated_length": 830.6290283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.18031994998455048,
"kl": 0.053371429443359375,
"learning_rate": 1.138888888888889e-06,
"loss": -0.1049,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018375877290964127,
"mask/share_reasoning": 0.772445797920227,
"mask/share_step_conf": 0.17792832851409912,
"num_tokens": 49179308.0,
"reward": 0.7028266191482544,
"reward_std": 0.22729066014289856,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7426198720932007,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.33959585428237915,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.747994065284729,
"adv/mean_abs_reasoning": 0.27806806564331055,
"adv/mean_abs_step_conf": 0.7542315125465393,
"adv/ratio_final_to_reasoning": 2.6899675212766505,
"adv/ratio_step_to_reasoning": 2.7123988898244193,
"adv/std_final_conf": 0.9273564219474792,
"adv/std_reasoning": 0.5725544691085815,
"adv/std_step_conf": 0.9354497194290161,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.4296875,
"calib/ece": 0.11035156249999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.003320174356443606,
"calib/mean_conf": 0.6501953125,
"calib/mu_c": 0.649378238341969,
"calib/mu_w": 0.6526984126984126,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0033203125,
"calib/std_conf": 0.04322408157529023,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5743053763440861,
"calib/step_q_c_n": 2325.0,
"calib/step_q_gap": -0.009486922372366702,
"calib/step_q_w": 0.5837922987164528,
"calib/step_q_w_n": 857.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1714.0,
"completions/max_terminated_length": 1714.0,
"completions/mean_length": 776.5078125,
"completions/mean_terminated_length": 782.6220703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.14907941222190857,
"kl": 0.05158233642578125,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0043,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020560087636113167,
"mask/share_reasoning": 0.7882505655288696,
"mask/share_step_conf": 0.18337681889533997,
"num_tokens": 49482014.0,
"reward": 0.7526839375495911,
"reward_std": 0.1831292361021042,
"rewards/accuracy_reward_step": 0.75390625,
"rewards/final_brier_reward_step": 0.8006120920181274,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.3539745807647705,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.7359354496002197,
"adv/mean_abs_reasoning": 0.27248501777648926,
"adv/mean_abs_step_conf": 0.748088002204895,
"adv/ratio_final_to_reasoning": 2.700829042292094,
"adv/ratio_step_to_reasoning": 2.7454280176920687,
"adv/std_final_conf": 0.9246222972869873,
"adv/std_reasoning": 0.5725796818733215,
"adv/std_step_conf": 0.9342425465583801,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.2265625,
"calib/ece": 0.15486166007905147,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.009047290640394223,
"calib/mean_conf": 0.6549407114624506,
"calib/mu_c": 0.6531527093596058,
"calib/mu_w": 0.6622,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0037154150197628456,
"calib/std_conf": 0.05366872882087633,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5769044585987262,
"calib/step_q_c_n": 2355.0,
"calib/step_q_gap": -0.04254070269159649,
"calib/step_q_w": 0.6194451612903227,
"calib/step_q_w_n": 775.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2069.0,
"completions/max_terminated_length": 2069.0,
"completions/mean_length": 780.484375,
"completions/mean_terminated_length": 792.873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.1728,
"grad_norm": 0.17687027156352997,
"kl": 0.05267333984375,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0147,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.020576588809490204,
"mask/share_reasoning": 0.7769348621368408,
"mask/share_step_conf": 0.18686355650424957,
"num_tokens": 49785962.0,
"reward": 0.8413803577423096,
"reward_std": 0.1559084951877594,
"rewards/accuracy_reward_step": 0.79296875,
"rewards/final_brier_reward_step": 0.8044047355651855,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.5221060514450073,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.7620823383331299,
"adv/mean_abs_reasoning": 0.3502871096134186,
"adv/mean_abs_step_conf": 0.7656612992286682,
"adv/ratio_final_to_reasoning": 2.1755934415462046,
"adv/ratio_step_to_reasoning": 2.1858106627836293,
"adv/std_final_conf": 0.930777370929718,
"adv/std_reasoning": 0.6402459740638733,
"adv/std_step_conf": 0.9347506761550903,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.125,
"calib/ece": 0.07142292490118575,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.011347168734845314,
"calib/mean_conf": 0.6695256916996047,
"calib/mu_c": 0.6658479532163742,
"calib/mu_w": 0.6771951219512196,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03252964426877467,
"calib/std_conf": 0.057378263743452985,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5843631436314363,
"calib/step_q_c_n": 2214.0,
"calib/step_q_gap": -0.02153557248839255,
"calib/step_q_w": 0.6058987161198288,
"calib/step_q_w_n": 1402.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2330.0,
"completions/max_terminated_length": 2330.0,
"completions/mean_length": 904.625,
"completions/mean_terminated_length": 915.351806640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.16011030972003937,
"kl": 0.05437469482421875,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0748,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01856684312224388,
"mask/share_reasoning": 0.7873039245605469,
"mask/share_step_conf": 0.18241044878959656,
"num_tokens": 50122378.0,
"reward": 0.7258908748626709,
"reward_std": 0.1946534365415573,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7635785341262817,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3569532632827759,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.7561768293380737,
"adv/mean_abs_reasoning": 0.3443371057510376,
"adv/mean_abs_step_conf": 0.7469892501831055,
"adv/ratio_final_to_reasoning": 2.1960364326370456,
"adv/ratio_step_to_reasoning": 2.1693545008860973,
"adv/std_final_conf": 0.9291502237319946,
"adv/std_reasoning": 0.6402323842048645,
"adv/std_step_conf": 0.9354658722877502,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.19140625,
"calib/ece": 0.14956521739130432,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03309090909090917,
"calib/mean_conf": 0.6747826086956522,
"calib/mu_c": 0.6632727272727272,
"calib/mu_w": 0.6963636363636364,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08608695652173909,
"calib/std_conf": 0.05579865850025347,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5821397797989469,
"calib/step_q_c_n": 2089.0,
"calib/step_q_gap": -0.023320064760638526,
"calib/step_q_w": 0.6054598445595855,
"calib/step_q_w_n": 1544.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2112.0,
"completions/max_terminated_length": 2112.0,
"completions/mean_length": 943.09765625,
"completions/mean_terminated_length": 958.0675048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.1804104745388031,
"kl": 0.05059051513671875,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0569,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01607145369052887,
"mask/share_reasoning": 0.7968579530715942,
"mask/share_step_conf": 0.1714455485343933,
"num_tokens": 50469947.0,
"reward": 0.6703404188156128,
"reward_std": 0.2007838785648346,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7456773519515991,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.26844096183776855,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.7765575647354126,
"adv/mean_abs_reasoning": 0.42560875415802,
"adv/mean_abs_step_conf": 0.7829569578170776,
"adv/ratio_final_to_reasoning": 1.8245808084273856,
"adv/ratio_step_to_reasoning": 1.8396166671101444,
"adv/std_final_conf": 0.9294382929801941,
"adv/std_reasoning": 0.6816999912261963,
"adv/std_step_conf": 0.9358112812042236,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 13.54296875,
"calib/ece": 0.10681102362204728,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.016648547207661957,
"calib/mean_conf": 0.6614566929133859,
"calib/mu_c": 0.6548366013071896,
"calib/mu_w": 0.6714851485148515,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08295275590551185,
"calib/std_conf": 0.05583866747974536,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804133961276817,
"calib/step_q_c_n": 1911.0,
"calib/step_q_gap": -0.019663724694940377,
"calib/step_q_w": 0.6000771208226221,
"calib/step_q_w_n": 1556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2306.0,
"completions/max_terminated_length": 2306.0,
"completions/mean_length": 900.40234375,
"completions/mean_terminated_length": 907.4921264648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.176,
"grad_norm": 0.18720674514770508,
"kl": 0.0489501953125,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0342,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017843585461378098,
"mask/share_reasoning": 0.8022754788398743,
"mask/share_step_conf": 0.17206846177577972,
"num_tokens": 50806026.0,
"reward": 0.6247101426124573,
"reward_std": 0.2394401729106903,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7400652170181274,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.19138632714748383,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.7800743579864502,
"adv/mean_abs_reasoning": 0.32665368914604187,
"adv/mean_abs_step_conf": 0.7688565850257874,
"adv/ratio_final_to_reasoning": 2.388077599937011,
"adv/ratio_step_to_reasoning": 2.3537361143410913,
"adv/std_final_conf": 0.9293928742408752,
"adv/std_reasoning": 0.6185818314552307,
"adv/std_step_conf": 0.9357060790061951,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.0390625,
"calib/ece": 0.1617391304347827,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.011857707509881422,
"calib/gap": -0.037626811594203,
"calib/mean_conf": 0.6672727272727274,
"calib/mu_c": 0.6570108695652174,
"calib/mu_w": 0.6946376811594204,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05086956521739133,
"calib/std_conf": 0.06375032145367865,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5857695729537367,
"calib/step_q_c_n": 2248.0,
"calib/step_q_gap": -0.031221252734336735,
"calib/step_q_w": 0.6169908256880734,
"calib/step_q_w_n": 1090.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 868.63671875,
"completions/mean_terminated_length": 878.936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.16540831327438354,
"kl": 0.0525665283203125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0487,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01886698603630066,
"mask/share_reasoning": 0.7834525108337402,
"mask/share_step_conf": 0.1859617829322815,
"num_tokens": 51134581.0,
"reward": 0.8182835578918457,
"reward_std": 0.2058083713054657,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7699328064918518,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.5252280235290527,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.7705814838409424,
"adv/mean_abs_reasoning": 0.2326258420944214,
"adv/mean_abs_step_conf": 0.7521966695785522,
"adv/ratio_final_to_reasoning": 3.3125360316940546,
"adv/ratio_step_to_reasoning": 3.2335043381519077,
"adv/std_final_conf": 0.926654577255249,
"adv/std_reasoning": 0.5227459669113159,
"adv/std_step_conf": 0.935153067111969,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.265625,
"calib/ece": 0.13760784313725483,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.009097560975609653,
"calib/mean_conf": 0.6663137254901962,
"calib/mu_c": 0.6680975609756097,
"calib/mu_w": 0.659,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05485577889591566,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5880213903743315,
"calib/step_q_c_n": 2805.0,
"calib/step_q_gap": 0.00788602658414539,
"calib/step_q_w": 0.5801353637901862,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1724.0,
"completions/max_terminated_length": 1724.0,
"completions/mean_length": 874.4453125,
"completions/mean_terminated_length": 881.3306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 332.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.12058939784765244,
"kl": 0.046085357666015625,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0172,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017945684492588043,
"mask/share_reasoning": 0.7946851849555969,
"mask/share_step_conf": 0.17955660820007324,
"num_tokens": 51464047.0,
"reward": 0.795635461807251,
"reward_std": 0.16444918513298035,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8200753927230835,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.4118204712867737,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.7644122242927551,
"adv/mean_abs_reasoning": 0.35399892926216125,
"adv/mean_abs_step_conf": 0.7623947262763977,
"adv/ratio_final_to_reasoning": 2.1593630971879403,
"adv/ratio_step_to_reasoning": 2.1536639330109115,
"adv/std_final_conf": 0.9302164316177368,
"adv/std_reasoning": 0.6402161717414856,
"adv/std_step_conf": 0.9356199502944946,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 15.15625,
"calib/ece": 0.1268627450980392,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.01568627450980392,
"calib/gap": -0.03547490347490334,
"calib/mean_conf": 0.6865490196078431,
"calib/mu_c": 0.6768108108108108,
"calib/mu_w": 0.7122857142857142,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04396078431372551,
"calib/std_conf": 0.07045407730460142,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5919820522824815,
"calib/step_q_c_n": 2563.0,
"calib/step_q_gap": -0.027364948476820028,
"calib/step_q_w": 0.6193470007593015,
"calib/step_q_w_n": 1317.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2480.0,
"completions/max_terminated_length": 2480.0,
"completions/mean_length": 980.65234375,
"completions/mean_terminated_length": 988.3740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 314.0,
"epoch": 0.1792,
"grad_norm": 0.1715572476387024,
"kl": 0.0460968017578125,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0237,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.016328081488609314,
"mask/share_reasoning": 0.7952143549919128,
"mask/share_step_conf": 0.18064509332180023,
"num_tokens": 51819766.0,
"reward": 0.7117259502410889,
"reward_std": 0.21040663123130798,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7771878838539124,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3025140166282654,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.7541103363037109,
"adv/mean_abs_reasoning": 0.2814164161682129,
"adv/mean_abs_step_conf": 0.7638496160507202,
"adv/ratio_final_to_reasoning": 2.6796956146756967,
"adv/ratio_step_to_reasoning": 2.7143036872239157,
"adv/std_final_conf": 0.9286118745803833,
"adv/std_reasoning": 0.5726398825645447,
"adv/std_step_conf": 0.9354013800621033,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.765625,
"calib/ece": 0.07549019607843138,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.00700406276267862,
"calib/mean_conf": 0.6610588235294117,
"calib/mu_c": 0.6587790697674418,
"calib/mu_w": 0.6657831325301204,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.031019607843137287,
"calib/std_conf": 0.049602898304278,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5780000000000001,
"calib/step_q_c_n": 2070.0,
"calib/step_q_gap": -0.01578964941569272,
"calib/step_q_w": 0.5937896494156928,
"calib/step_q_w_n": 1198.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2138.0,
"completions/max_terminated_length": 2138.0,
"completions/mean_length": 868.57421875,
"completions/mean_terminated_length": 875.4133911132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.14597614109516144,
"kl": 0.0505218505859375,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0084,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018267178907990456,
"mask/share_reasoning": 0.795763373374939,
"mask/share_step_conf": 0.17815694212913513,
"num_tokens": 52146305.0,
"reward": 0.6932387351989746,
"reward_std": 0.19367341697216034,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7717105150222778,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.2811731696128845,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.7423576712608337,
"adv/mean_abs_reasoning": 0.2969627380371094,
"adv/mean_abs_step_conf": 0.7681744694709778,
"adv/ratio_final_to_reasoning": 2.4998344107672743,
"adv/ratio_step_to_reasoning": 2.5867705643762764,
"adv/std_final_conf": 0.9297893643379211,
"adv/std_reasoning": 0.5959508419036865,
"adv/std_step_conf": 0.9360413551330566,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.5078125,
"calib/ece": 0.0816862745098039,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.002643724696356209,
"calib/mean_conf": 0.6724313725490195,
"calib/mu_c": 0.6731052631578947,
"calib/mu_w": 0.6704615384615384,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.004509803921568629,
"calib/std_conf": 0.05080885166897625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5863885530028214,
"calib/step_q_c_n": 2481.0,
"calib/step_q_gap": -0.010909297560126374,
"calib/step_q_w": 0.5972978505629478,
"calib/step_q_w_n": 977.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 900.76953125,
"completions/mean_terminated_length": 904.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 342.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.19842861592769623,
"kl": 0.047882080078125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0151,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017363911494612694,
"mask/share_reasoning": 0.7947203516960144,
"mask/share_step_conf": 0.18400946259498596,
"num_tokens": 52481054.0,
"reward": 0.7063318490982056,
"reward_std": 0.20461004972457886,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.8000777363777161,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.26492974162101746,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.7441630363464355,
"adv/mean_abs_reasoning": 0.26128995418548584,
"adv/mean_abs_step_conf": 0.7661536335945129,
"adv/ratio_final_to_reasoning": 2.8480353891377135,
"adv/ratio_step_to_reasoning": 2.932197052821372,
"adv/std_final_conf": 0.9314100742340088,
"adv/std_reasoning": 0.5725415945053101,
"adv/std_step_conf": 0.93570876121521,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.25,
"calib/ece": 0.08803921568627451,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.007858428700012854,
"calib/mean_conf": 0.6673333333333334,
"calib/mu_c": 0.6642207792207792,
"calib/mu_w": 0.672079207920792,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07572549019607842,
"calib/std_conf": 0.05861762656224533,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5893823116518485,
"calib/step_q_c_n": 2137.0,
"calib/step_q_gap": -0.0010412489040747541,
"calib/step_q_w": 0.5904235605559233,
"calib/step_q_w_n": 1511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 879.234375,
"completions/mean_terminated_length": 882.6824340820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.1824,
"grad_norm": 0.18084461987018585,
"kl": 0.0527496337890625,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0115,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01803310588002205,
"mask/share_reasoning": 0.7983774542808533,
"mask/share_step_conf": 0.17968320846557617,
"num_tokens": 52813034.0,
"reward": 0.6411213278770447,
"reward_std": 0.1809045970439911,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7466551065444946,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.21605640649795532,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.7528913021087646,
"adv/mean_abs_reasoning": 0.309874027967453,
"adv/mean_abs_step_conf": 0.7537387609481812,
"adv/ratio_final_to_reasoning": 2.429668943367668,
"adv/ratio_step_to_reasoning": 2.4324037928966042,
"adv/std_final_conf": 0.9280959963798523,
"adv/std_reasoning": 0.5960396528244019,
"adv/std_step_conf": 0.9357143640518188,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.8828125,
"calib/ece": 0.198740157480315,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02959276980329617,
"calib/mean_conf": 0.6540944881889763,
"calib/mu_c": 0.6488516746411483,
"calib/mu_w": 0.6784444444444445,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.015000000000000001,
"calib/std_conf": 0.055992889462925036,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5778473662380755,
"calib/step_q_c_n": 2411.0,
"calib/step_q_gap": -0.008571017280149529,
"calib/step_q_w": 0.5864183835182251,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 820.38671875,
"completions/mean_terminated_length": 826.846435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.15892651677131653,
"kl": 0.0509033203125,
"learning_rate": 8.055555555555557e-07,
"loss": -0.036,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019890522584319115,
"mask/share_reasoning": 0.7924667596817017,
"mask/share_step_conf": 0.17983026802539825,
"num_tokens": 53126405.0,
"reward": 0.8330913782119751,
"reward_std": 0.19977548718452454,
"rewards/accuracy_reward_step": 0.81640625,
"rewards/final_brier_reward_step": 0.8076265454292297,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.4968373775482178,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.761202871799469,
"adv/mean_abs_reasoning": 0.23559096455574036,
"adv/mean_abs_step_conf": 0.7611009478569031,
"adv/ratio_final_to_reasoning": 3.2310359322773174,
"adv/ratio_step_to_reasoning": 3.2306033013283413,
"adv/std_final_conf": 0.9258071184158325,
"adv/std_reasoning": 0.5226951241493225,
"adv/std_step_conf": 0.9345263838768005,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.8671875,
"calib/ece": 0.12457031249999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.003093838099073598,
"calib/mean_conf": 0.6675390625,
"calib/mu_c": 0.6683246073298429,
"calib/mu_w": 0.6652307692307693,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.023007812500000006,
"calib/std_conf": 0.07051566784496262,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5879166666666666,
"calib/step_q_c_n": 2472.0,
"calib/step_q_gap": -0.005185523114355273,
"calib/step_q_w": 0.5931021897810219,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2083.0,
"completions/max_terminated_length": 2083.0,
"completions/mean_length": 870.73046875,
"completions/mean_terminated_length": 877.5866088867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.16663791239261627,
"kl": 0.0567626953125,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0124,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0201319120824337,
"mask/share_reasoning": 0.7883695960044861,
"mask/share_step_conf": 0.1836860179901123,
"num_tokens": 53452472.0,
"reward": 0.7370842695236206,
"reward_std": 0.1307639330625534,
"rewards/accuracy_reward_step": 0.74609375,
"rewards/final_brier_reward_step": 0.800590991973877,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.32435885071754456,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7598955035209656,
"adv/mean_abs_reasoning": 0.5007636547088623,
"adv/mean_abs_step_conf": 0.7425982356071472,
"adv/ratio_final_to_reasoning": 1.5174733556946325,
"adv/ratio_step_to_reasoning": 1.4829315758526536,
"adv/std_final_conf": 0.9315431714057922,
"adv/std_reasoning": 0.7575818300247192,
"adv/std_step_conf": 0.9360539317131042,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.90234375,
"calib/ece": 0.08381526104417669,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0014946985282483505,
"calib/mean_conf": 0.6689558232931727,
"calib/mu_c": 0.6693820224719101,
"calib/mu_w": 0.6678873239436618,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.018955823293172715,
"calib/std_conf": 0.058698347850569405,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5949143767423337,
"calib/step_q_c_n": 2511.0,
"calib/step_q_gap": -0.006431043104994538,
"calib/step_q_w": 0.6013454198473283,
"calib/step_q_w_n": 1048.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2954.0,
"completions/max_terminated_length": 2954.0,
"completions/mean_length": 896.1875,
"completions/mean_terminated_length": 917.696044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.1856,
"grad_norm": 0.47667044401168823,
"kl": 0.049530029296875,
"learning_rate": 7.5e-07,
"loss": -0.0361,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.017272762954235077,
"mask/share_reasoning": 0.7841352224349976,
"mask/share_step_conf": 0.17515450716018677,
"num_tokens": 53786128.0,
"reward": 0.7226041555404663,
"reward_std": 0.24592873454093933,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7657800912857056,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.34661564230918884,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.7577539682388306,
"adv/mean_abs_reasoning": 0.44572803378105164,
"adv/mean_abs_step_conf": 0.7865190505981445,
"adv/ratio_final_to_reasoning": 1.7000365936396336,
"adv/ratio_step_to_reasoning": 1.7645716468093964,
"adv/std_final_conf": 0.9305837750434875,
"adv/std_reasoning": 0.7013881802558899,
"adv/std_step_conf": 0.9361911416053772,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.7109375,
"calib/ece": 0.15802371541501978,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.04027268093781855,
"calib/mean_conf": 0.6776284584980237,
"calib/mu_c": 0.6602777777777779,
"calib/mu_w": 0.7005504587155964,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1332411067193676,
"calib/std_conf": 0.06925855421827964,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5890109890109889,
"calib/step_q_c_n": 1911.0,
"calib/step_q_gap": -0.02031515654157179,
"calib/step_q_w": 0.6093261455525607,
"calib/step_q_w_n": 1855.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 928.3125,
"completions/mean_terminated_length": 943.0476684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 289.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.17033083736896515,
"kl": 0.04911041259765625,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0705,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018152743577957153,
"mask/share_reasoning": 0.7822288274765015,
"mask/share_step_conf": 0.18399344384670258,
"num_tokens": 54129600.0,
"reward": 0.6329188942909241,
"reward_std": 0.24493342638015747,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7100539207458496,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.24562771618366241,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.7080068588256836,
"adv/mean_abs_reasoning": 0.3450409770011902,
"adv/mean_abs_step_conf": 0.744574785232544,
"adv/ratio_final_to_reasoning": 2.0519500755507116,
"adv/ratio_step_to_reasoning": 2.157931477309652,
"adv/std_final_conf": 0.9148129820823669,
"adv/std_reasoning": 0.6401710510253906,
"adv/std_step_conf": 0.9354899525642395,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.328125,
"calib/ece": 0.06141176470588236,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.004428571428571448,
"calib/mean_conf": 0.6659607843137255,
"calib/mu_c": 0.6645714285714286,
"calib/mu_w": 0.669,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.020549019607843125,
"calib/std_conf": 0.06376856462771426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5853746654772525,
"calib/step_q_c_n": 2242.0,
"calib/step_q_gap": -0.013078325975738991,
"calib/step_q_w": 0.5984529914529915,
"calib/step_q_w_n": 1170.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2261.0,
"completions/max_terminated_length": 2261.0,
"completions/mean_length": 885.89453125,
"completions/mean_terminated_length": 892.8700561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.19338367879390717,
"kl": 0.04975128173828125,
"learning_rate": 6.944444444444446e-07,
"loss": 0.0051,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019442148506641388,
"mask/share_reasoning": 0.7906622886657715,
"mask/share_step_conf": 0.18208308517932892,
"num_tokens": 54460453.0,
"reward": 0.7801279425621033,
"reward_std": 0.1804857850074768,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7752718925476074,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.4490464925765991,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.7342836260795593,
"adv/mean_abs_reasoning": 0.28962162137031555,
"adv/mean_abs_step_conf": 0.7492353916168213,
"adv/ratio_final_to_reasoning": 2.5353204729859953,
"adv/ratio_step_to_reasoning": 2.586945643325555,
"adv/std_final_conf": 0.9285452961921692,
"adv/std_reasoning": 0.595961332321167,
"adv/std_step_conf": 0.9352489113807678,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.16015625,
"calib/ece": 0.10365079365079358,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.03676552106430164,
"calib/mean_conf": 0.6577777777777778,
"calib/mu_c": 0.6449390243902439,
"calib/mu_w": 0.6817045454545455,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05531746031746029,
"calib/std_conf": 0.05584685534749997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5758350730688936,
"calib/step_q_c_n": 1916.0,
"calib/step_q_gap": -0.025871740420438827,
"calib/step_q_w": 0.6017068134893324,
"calib/step_q_w_n": 1453.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2263.0,
"completions/max_terminated_length": 2263.0,
"completions/mean_length": 829.31640625,
"completions/mean_terminated_length": 839.1502075195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.1888,
"grad_norm": 0.16369785368442535,
"kl": 0.0509490966796875,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0189,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019376084208488464,
"mask/share_reasoning": 0.7894700765609741,
"mask/share_step_conf": 0.17943505942821503,
"num_tokens": 54776590.0,
"reward": 0.7116050124168396,
"reward_std": 0.17901837825775146,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7410968542098999,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.357113242149353,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.7370127439498901,
"adv/mean_abs_reasoning": 0.3701658248901367,
"adv/mean_abs_step_conf": 0.7550868391990662,
"adv/ratio_final_to_reasoning": 1.991034002581496,
"adv/ratio_step_to_reasoning": 2.03986102559082,
"adv/std_final_conf": 0.9300718307495117,
"adv/std_reasoning": 0.6611856818199158,
"adv/std_step_conf": 0.9358887076377869,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.171875,
"calib/ece": 0.06717647058823525,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.012167721518987262,
"calib/mean_conf": 0.6750196078431371,
"calib/mu_c": 0.67125,
"calib/mu_w": 0.6834177215189873,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.026000000000000013,
"calib/std_conf": 0.06235178195987427,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.589878508588186,
"calib/step_q_c_n": 2387.0,
"calib/step_q_gap": -0.011475238390057263,
"calib/step_q_w": 0.6013537469782433,
"calib/step_q_w_n": 1241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2166.0,
"completions/max_terminated_length": 2166.0,
"completions/mean_length": 865.33984375,
"completions/mean_terminated_length": 872.153564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.16527439653873444,
"kl": 0.04903411865234375,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0028,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01791071519255638,
"mask/share_reasoning": 0.7861236333847046,
"mask/share_step_conf": 0.18815310299396515,
"num_tokens": 55104189.0,
"reward": 0.7469719648361206,
"reward_std": 0.22839292883872986,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7738183736801147,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.38340672850608826,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.750512957572937,
"adv/mean_abs_reasoning": 0.316341757774353,
"adv/mean_abs_step_conf": 0.7563944458961487,
"adv/ratio_final_to_reasoning": 2.3724751447713675,
"adv/ratio_step_to_reasoning": 2.3910673419083857,
"adv/std_final_conf": 0.9268893003463745,
"adv/std_reasoning": 0.6185594797134399,
"adv/std_step_conf": 0.9358287453651428,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.703125,
"calib/ece": 0.15624000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.008,
"calib/gap": -0.04347124642206612,
"calib/mean_conf": 0.6664800000000001,
"calib/mu_c": 0.6558730158730159,
"calib/mu_w": 0.6993442622950821,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03336,
"calib/std_conf": 0.05700885545246458,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5797626112759644,
"calib/step_q_c_n": 2359.0,
"calib/step_q_gap": -0.032500225973818075,
"calib/step_q_w": 0.6122628372497825,
"calib/step_q_w_n": 1149.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3043.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 876.16796875,
"completions/mean_terminated_length": 893.6215209960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 336.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.2310127168893814,
"kl": 0.04970550537109375,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0686,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018145693466067314,
"mask/share_reasoning": 0.7825173139572144,
"mask/share_step_conf": 0.179805725812912,
"num_tokens": 55434752.0,
"reward": 0.7343745231628418,
"reward_std": 0.21182644367218018,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7697601914405823,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.3560202717781067,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.7362815141677856,
"adv/mean_abs_reasoning": 0.3285897374153137,
"adv/mean_abs_step_conf": 0.7740625143051147,
"adv/ratio_final_to_reasoning": 2.2407319229120626,
"adv/ratio_step_to_reasoning": 2.3557111685650596,
"adv/std_final_conf": 0.9302303194999695,
"adv/std_reasoning": 0.6184530854225159,
"adv/std_step_conf": 0.936010479927063,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 15.328125,
"calib/ece": 0.08861111111111107,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.015873015873015872,
"calib/gap": -0.010345357430306512,
"calib/mean_conf": 0.6882142857142858,
"calib/mu_c": 0.684971098265896,
"calib/mu_w": 0.6953164556962025,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04515873015873011,
"calib/std_conf": 0.06839707911043426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5966389219183512,
"calib/step_q_c_n": 2523.0,
"calib/step_q_gap": -0.020191913199421818,
"calib/step_q_w": 0.616830835117773,
"calib/step_q_w_n": 1401.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 1002.41796875,
"completions/mean_terminated_length": 1022.386474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.192,
"grad_norm": 0.13480441272258759,
"kl": 0.041332244873046875,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0373,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.015691669657826424,
"mask/share_reasoning": 0.7890633344650269,
"mask/share_step_conf": 0.17571374773979187,
"num_tokens": 55795227.0,
"reward": 0.7093750238418579,
"reward_std": 0.20935778319835663,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7635316252708435,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3231871724128723,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.7619613409042358,
"adv/mean_abs_reasoning": 0.2994126081466675,
"adv/mean_abs_step_conf": 0.7267587184906006,
"adv/ratio_final_to_reasoning": 2.544853891159415,
"adv/ratio_step_to_reasoning": 2.427281613119636,
"adv/std_final_conf": 0.9292603135108948,
"adv/std_reasoning": 0.5959325432777405,
"adv/std_step_conf": 0.9358544945716858,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.10546875,
"calib/ece": 0.03640625000000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": -0.0017675657675658707,
"calib/mean_conf": 0.665234375,
"calib/mu_c": 0.6646060606060605,
"calib/mu_w": 0.6663736263736264,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.028554687500000016,
"calib/std_conf": 0.05269257602698292,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5831547064305685,
"calib/step_q_c_n": 2146.0,
"calib/step_q_gap": -0.004893267101276022,
"calib/step_q_w": 0.5880479735318446,
"calib/step_q_w_n": 1209.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1829.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 842.7578125,
"completions/mean_terminated_length": 849.3936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.1750410497188568,
"kl": 0.05321502685546875,
"learning_rate": 5.555555555555555e-07,
"loss": -0.027,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01841357722878456,
"mask/share_reasoning": 0.7919554114341736,
"mask/share_step_conf": 0.18181854486465454,
"num_tokens": 56117237.0,
"reward": 0.6624540686607361,
"reward_std": 0.19779205322265625,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7668741941452026,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.2291276752948761,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.745903730392456,
"adv/mean_abs_reasoning": 0.32951363921165466,
"adv/mean_abs_step_conf": 0.7798632383346558,
"adv/ratio_final_to_reasoning": 2.2636505492670786,
"adv/ratio_step_to_reasoning": 2.366710040289806,
"adv/std_final_conf": 0.9296409487724304,
"adv/std_reasoning": 0.6184464693069458,
"adv/std_step_conf": 0.9356173276901245,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.8125,
"calib/ece": 0.0877734375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.014350877192982447,
"calib/mean_conf": 0.6644140624999999,
"calib/mu_c": 0.6596491228070175,
"calib/mu_w": 0.6739999999999999,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.042109375,
"calib/std_conf": 0.05715631353792942,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5816244822825586,
"calib/step_q_c_n": 2173.0,
"calib/step_q_gap": -0.004220142830359319,
"calib/step_q_w": 0.5858446251129179,
"calib/step_q_w_n": 1107.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1934.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 863.03125,
"completions/mean_terminated_length": 869.8267822265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 403.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.18245761096477509,
"kl": 0.0487518310546875,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0171,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018086858093738556,
"mask/share_reasoning": 0.7957003116607666,
"mask/share_step_conf": 0.17840032279491425,
"num_tokens": 56444333.0,
"reward": 0.6843788623809814,
"reward_std": 0.2277456820011139,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7685683965682983,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.26659566164016724,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.7563251256942749,
"adv/mean_abs_reasoning": 0.45584434270858765,
"adv/mean_abs_step_conf": 0.7536901831626892,
"adv/ratio_final_to_reasoning": 1.6591740970179787,
"adv/ratio_step_to_reasoning": 1.6533937411273492,
"adv/std_final_conf": 0.9308493733406067,
"adv/std_reasoning": 0.7205421924591064,
"adv/std_step_conf": 0.9354673624038696,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 14.375,
"calib/ece": 0.07956862745098046,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.016951846657316527,
"calib/mean_conf": 0.674156862745098,
"calib/mu_c": 0.6695698924731183,
"calib/mu_w": 0.6865217391304348,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01215686274509806,
"calib/std_conf": 0.05470755496775367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5892304702681695,
"calib/step_q_c_n": 2573.0,
"calib/step_q_gap": -0.015195907328939828,
"calib/step_q_w": 0.6044263775971094,
"calib/step_q_w_n": 1107.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2084.0,
"completions/max_terminated_length": 2084.0,
"completions/mean_length": 921.55078125,
"completions/mean_terminated_length": 928.8070678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.1952,
"grad_norm": 0.2044658213853836,
"kl": 0.0491943359375,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0288,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.017305593937635422,
"mask/share_reasoning": 0.7930552959442139,
"mask/share_step_conf": 0.1818266063928604,
"num_tokens": 56786930.0,
"reward": 0.7817938923835754,
"reward_std": 0.21965757012367249,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.786806583404541,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.43224990367889404,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.748545229434967,
"adv/mean_abs_reasoning": 0.3237451910972595,
"adv/mean_abs_step_conf": 0.7547122240066528,
"adv/ratio_final_to_reasoning": 2.3121431607924303,
"adv/ratio_step_to_reasoning": 2.3311920756219733,
"adv/std_final_conf": 0.9301021099090576,
"adv/std_reasoning": 0.6184375882148743,
"adv/std_step_conf": 0.9356794357299805,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.68359375,
"calib/ece": 0.10901185770750993,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.013689839572192497,
"calib/mean_conf": 0.6589723320158102,
"calib/mu_c": 0.6554010695187166,
"calib/mu_w": 0.6690909090909091,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.014426877470355716,
"calib/std_conf": 0.054813114264309196,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.579004424778761,
"calib/step_q_c_n": 2260.0,
"calib/step_q_gap": -0.018432252019617956,
"calib/step_q_w": 0.597436676798379,
"calib/step_q_w_n": 987.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1954.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 865.20703125,
"completions/mean_terminated_length": 878.9405517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.14740177989006042,
"kl": 0.0485382080078125,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0276,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.017934516072273254,
"mask/share_reasoning": 0.7966513633728027,
"mask/share_step_conf": 0.16978907585144043,
"num_tokens": 57113703.0,
"reward": 0.7120828032493591,
"reward_std": 0.19992341101169586,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7831875085830688,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.2972281277179718,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.750738799571991,
"adv/mean_abs_reasoning": 0.2984664738178253,
"adv/mean_abs_step_conf": 0.7430471181869507,
"adv/ratio_final_to_reasoning": 2.5153203640225894,
"adv/ratio_step_to_reasoning": 2.489549692741985,
"adv/std_final_conf": 0.9285163283348083,
"adv/std_reasoning": 0.5960050821304321,
"adv/std_step_conf": 0.9359565377235413,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.265625,
"calib/ece": 0.17133858267716545,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": -0.026795142969056163,
"calib/mean_conf": 0.664251968503937,
"calib/mu_c": 0.656972972972973,
"calib/mu_w": 0.6837681159420291,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05362204724409457,
"calib/std_conf": 0.059697659314424174,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5884972562262558,
"calib/step_q_c_n": 2369.0,
"calib/step_q_gap": -0.0397256588166125,
"calib/step_q_w": 0.6282229150428683,
"calib/step_q_w_n": 1283.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2269.0,
"completions/max_terminated_length": 2269.0,
"completions/mean_length": 883.109375,
"completions/mean_terminated_length": 893.5810546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.16363385319709778,
"kl": 0.042522430419921875,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0004,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.018001873046159744,
"mask/share_reasoning": 0.7912944555282593,
"mask/share_step_conf": 0.17898491024971008,
"num_tokens": 57446699.0,
"reward": 0.7514989972114563,
"reward_std": 0.2056909203529358,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7777429819107056,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.38228633999824524,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.7524923086166382,
"adv/mean_abs_reasoning": 0.2546382546424866,
"adv/mean_abs_step_conf": 0.738852858543396,
"adv/ratio_final_to_reasoning": 2.955142422230082,
"adv/ratio_step_to_reasoning": 2.901578396304786,
"adv/std_final_conf": 0.9246767163276672,
"adv/std_reasoning": 0.5482885241508484,
"adv/std_step_conf": 0.9352895617485046,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.984375,
"calib/ece": 0.16707509881422924,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.015810276679841896,
"calib/gap": -0.044274615774362536,
"calib/mean_conf": 0.6678656126482213,
"calib/mu_c": 0.6570157068062827,
"calib/mu_w": 0.7012903225806453,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.040000000000000036,
"calib/std_conf": 0.06039551559243489,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5798088907353551,
"calib/step_q_c_n": 2407.0,
"calib/step_q_gap": -0.03750568726976011,
"calib/step_q_w": 0.6173145780051152,
"calib/step_q_w_n": 1173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2724.0,
"completions/max_terminated_length": 2724.0,
"completions/mean_length": 889.66796875,
"completions/mean_terminated_length": 903.7897338867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 238.0,
"epoch": 0.1984,
"grad_norm": 0.15272776782512665,
"kl": 0.0468902587890625,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0542,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01797938346862793,
"mask/share_reasoning": 0.7882766723632812,
"mask/share_step_conf": 0.17811891436576843,
"num_tokens": 57779494.0,
"reward": 0.741277813911438,
"reward_std": 0.1670185625553131,
"rewards/accuracy_reward_step": 0.74609375,
"rewards/final_brier_reward_step": 0.7761093378067017,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.36035263538360596,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7613424062728882,
"adv/mean_abs_reasoning": 0.48220592737197876,
"adv/mean_abs_step_conf": 0.772615373134613,
"adv/ratio_final_to_reasoning": 1.5788740101603533,
"adv/ratio_step_to_reasoning": 1.6022519203473196,
"adv/std_final_conf": 0.9324616193771362,
"adv/std_reasoning": 0.7392339110374451,
"adv/std_step_conf": 0.936041533946991,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 15.078125,
"calib/ece": 0.1039759036144578,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.004016064257028112,
"calib/gap": -0.018737546699875596,
"calib/mean_conf": 0.6740160642570281,
"calib/mu_c": 0.6685227272727272,
"calib/mu_w": 0.6872602739726028,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03558232931726907,
"calib/std_conf": 0.06741896625591312,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5895748160261651,
"calib/step_q_c_n": 2446.0,
"calib/step_q_gap": -0.04200227025389147,
"calib/step_q_w": 0.6315770862800566,
"calib/step_q_w_n": 1414.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 897.34375,
"completions/mean_terminated_length": 926.290283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.2161259949207306,
"kl": 0.0449066162109375,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0177,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.017458781599998474,
"mask/share_reasoning": 0.7766166925430298,
"mask/share_step_conf": 0.17467457056045532,
"num_tokens": 58110758.0,
"reward": 0.7140494585037231,
"reward_std": 0.25206685066223145,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7580785155296326,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.33798912167549133,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.7479918003082275,
"adv/mean_abs_reasoning": 0.267370343208313,
"adv/mean_abs_step_conf": 0.769452691078186,
"adv/ratio_final_to_reasoning": 2.7975870148225597,
"adv/ratio_step_to_reasoning": 2.8778535489206885,
"adv/std_final_conf": 0.9274735450744629,
"adv/std_reasoning": 0.5482823848724365,
"adv/std_step_conf": 0.9348469376564026,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.10546875,
"calib/ece": 0.1229644268774703,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.026561677758860647,
"calib/mean_conf": 0.668498023715415,
"calib/mu_c": 0.661043956043956,
"calib/mu_w": 0.6876056338028167,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0360474308300395,
"calib/std_conf": 0.05998778693262936,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5861072164948453,
"calib/step_q_c_n": 2425.0,
"calib/step_q_gap": -0.02619463847985959,
"calib/step_q_w": 0.6123018549747049,
"calib/step_q_w_n": 1186.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2227.0,
"completions/max_terminated_length": 2227.0,
"completions/mean_length": 909.1875,
"completions/mean_terminated_length": 919.9684448242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.11366663873195648,
"kl": 0.046966552734375,
"learning_rate": 3.611111111111111e-07,
"loss": -0.04,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01853599213063717,
"mask/share_reasoning": 0.7887281775474548,
"mask/share_step_conf": 0.18101707100868225,
"num_tokens": 58447582.0,
"reward": 0.7634593844413757,
"reward_std": 0.1651395559310913,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7720566391944885,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.41501832008361816,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.7274742126464844,
"adv/mean_abs_reasoning": 0.3696415424346924,
"adv/mean_abs_step_conf": 0.7473438382148743,
"adv/ratio_final_to_reasoning": 1.9680531789118731,
"adv/ratio_step_to_reasoning": 2.0218069465147135,
"adv/std_final_conf": 0.9266048073768616,
"adv/std_reasoning": 0.6611664295196533,
"adv/std_step_conf": 0.936164140701294,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.984375,
"calib/ece": 0.10409448818897638,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.021564311394820068,
"calib/mean_conf": 0.6561417322834645,
"calib/mu_c": 0.6496045197740112,
"calib/mu_w": 0.6711688311688313,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.031692913385826756,
"calib/std_conf": 0.05016582335833497,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5773882017126547,
"calib/step_q_c_n": 2102.0,
"calib/step_q_gap": -0.017619981593401057,
"calib/step_q_w": 0.5950081833060558,
"calib/step_q_w_n": 1222.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 823.2421875,
"completions/mean_terminated_length": 833.0039672851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.2016,
"grad_norm": 0.21445226669311523,
"kl": 0.04804229736328125,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0545,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01917072758078575,
"mask/share_reasoning": 0.7900059223175049,
"mask/share_step_conf": 0.17910461127758026,
"num_tokens": 58766100.0,
"reward": 0.7019118666648865,
"reward_std": 0.24199704825878143,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7694070339202881,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.2976979613304138,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.7516705393791199,
"adv/mean_abs_reasoning": 0.3387170433998108,
"adv/mean_abs_step_conf": 0.7623640298843384,
"adv/ratio_final_to_reasoning": 2.2191695222489054,
"adv/ratio_step_to_reasoning": 2.250740093360074,
"adv/std_final_conf": 0.9286521673202515,
"adv/std_reasoning": 0.6185595989227295,
"adv/std_step_conf": 0.9362083673477173,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.07421875,
"calib/ece": 0.08423529411764709,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.020256869772998787,
"calib/mean_conf": 0.6749803921568628,
"calib/mu_c": 0.6675925925925926,
"calib/mu_w": 0.6878494623655914,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06196078431372551,
"calib/std_conf": 0.05709211346109072,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5859443631039531,
"calib/step_q_c_n": 2049.0,
"calib/step_q_gap": -0.006744388822086944,
"calib/step_q_w": 0.5926887519260401,
"calib/step_q_w_n": 1298.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2104.0,
"completions/max_terminated_length": 2104.0,
"completions/mean_length": 935.9375,
"completions/mean_terminated_length": 943.3070678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.1718294471502304,
"kl": 0.0427703857421875,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0173,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0172400064766407,
"mask/share_reasoning": 0.7973253726959229,
"mask/share_step_conf": 0.17762216925621033,
"num_tokens": 59111308.0,
"reward": 0.6847772598266602,
"reward_std": 0.23699063062667847,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7511374950408936,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.2926359176635742,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.7656644582748413,
"adv/mean_abs_reasoning": 0.2647988796234131,
"adv/mean_abs_step_conf": 0.7759451270103455,
"adv/ratio_final_to_reasoning": 2.8914943271804634,
"adv/ratio_step_to_reasoning": 2.9303187691498738,
"adv/std_final_conf": 0.9286467432975769,
"adv/std_reasoning": 0.5482805371284485,
"adv/std_step_conf": 0.9354783296585083,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.5234375,
"calib/ece": 0.08047430830039534,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.01485738255033553,
"calib/mean_conf": 0.66,
"calib/mu_c": 0.6538926174496644,
"calib/mu_w": 0.66875,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07577075098814239,
"calib/std_conf": 0.04730691983242642,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804259850905218,
"calib/step_q_c_n": 1878.0,
"calib/step_q_gap": -0.016499519959983222,
"calib/step_q_w": 0.596925505050505,
"calib/step_q_w_n": 1584.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2201.0,
"completions/max_terminated_length": 2201.0,
"completions/mean_length": 831.8828125,
"completions/mean_terminated_length": 841.7470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 253.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.1342456191778183,
"kl": 0.05359649658203125,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0491,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019521009176969528,
"mask/share_reasoning": 0.7809332609176636,
"mask/share_step_conf": 0.1878269612789154,
"num_tokens": 59428438.0,
"reward": 0.6383215188980103,
"reward_std": 0.16677504777908325,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7347148656845093,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.22786563634872437,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.7587395906448364,
"adv/mean_abs_reasoning": 0.2984263002872467,
"adv/mean_abs_step_conf": 0.7594149708747864,
"adv/ratio_final_to_reasoning": 2.5424689107981453,
"adv/ratio_step_to_reasoning": 2.544732049902507,
"adv/std_final_conf": 0.926181972026825,
"adv/std_reasoning": 0.5960554480552673,
"adv/std_step_conf": 0.9353460073471069,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 12.73046875,
"calib/ece": 0.14039525691699598,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.038947330132401925,
"calib/mean_conf": 0.6573913043478262,
"calib/mu_c": 0.6452298850574714,
"calib/mu_w": 0.6841772151898733,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.055019762845849786,
"calib/std_conf": 0.05948654681899828,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5767428003972195,
"calib/step_q_c_n": 2014.0,
"calib/step_q_gap": -0.027241135345752454,
"calib/step_q_w": 0.6039839357429719,
"calib/step_q_w_n": 1245.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 2512.0,
"completions/mean_length": 850.34375,
"completions/mean_terminated_length": 860.4269409179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.2048,
"grad_norm": 0.16092029213905334,
"kl": 0.05052947998046875,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0241,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.020145608112215996,
"mask/share_reasoning": 0.7876855134963989,
"mask/share_step_conf": 0.18045015633106232,
"num_tokens": 59751102.0,
"reward": 0.7235573530197144,
"reward_std": 0.19452857971191406,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.755107045173645,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.35841381549835205,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7481487989425659,
"adv/mean_abs_reasoning": 0.38850438594818115,
"adv/mean_abs_step_conf": 0.7433756589889526,
"adv/ratio_final_to_reasoning": 1.925715193965286,
"adv/ratio_step_to_reasoning": 1.9134292581399694,
"adv/std_final_conf": 0.9300759434700012,
"adv/std_reasoning": 0.6815869808197021,
"adv/std_step_conf": 0.9363304972648621,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.76171875,
"calib/ece": 0.08035714285714282,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.011904761904761904,
"calib/gap": -0.03173860991020938,
"calib/mean_conf": 0.6730555555555556,
"calib/mu_c": 0.6608387096774194,
"calib/mu_w": 0.6925773195876288,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06916666666666664,
"calib/std_conf": 0.06814280121938554,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5856571978815599,
"calib/step_q_c_n": 2077.0,
"calib/step_q_gap": -0.02949556357554939,
"calib/step_q_w": 0.6151527614571093,
"calib/step_q_w_n": 1702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2420.0,
"completions/max_terminated_length": 2420.0,
"completions/mean_length": 882.81640625,
"completions/mean_terminated_length": 896.8294067382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.16130997240543365,
"kl": 0.046909332275390625,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0396,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01840098574757576,
"mask/share_reasoning": 0.7815666198730469,
"mask/share_step_conf": 0.18440741300582886,
"num_tokens": 60082815.0,
"reward": 0.6326255798339844,
"reward_std": 0.2460542917251587,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7286441326141357,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.21863830089569092,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.7442784905433655,
"adv/mean_abs_reasoning": 0.22733885049819946,
"adv/mean_abs_step_conf": 0.769699215888977,
"adv/ratio_final_to_reasoning": 3.2738728506470576,
"adv/ratio_step_to_reasoning": 3.3856915094020548,
"adv/std_final_conf": 0.9278947710990906,
"adv/std_reasoning": 0.4959554374217987,
"adv/std_step_conf": 0.9358815550804138,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.81640625,
"calib/ece": 0.11011718749999992,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0078125,
"calib/gap": -0.022809941520468113,
"calib/mean_conf": 0.6673828125000001,
"calib/mu_c": 0.660611111111111,
"calib/mu_w": 0.6834210526315792,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0371875,
"calib/std_conf": 0.059900519234726536,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.582438704028021,
"calib/step_q_c_n": 2284.0,
"calib/step_q_gap": -0.016707345453224054,
"calib/step_q_w": 0.5991460494812451,
"calib/step_q_w_n": 1253.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1961.0,
"completions/max_terminated_length": 1961.0,
"completions/mean_length": 863.06640625,
"completions/mean_terminated_length": 869.8621826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.15819790959358215,
"kl": 0.053924560546875,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0045,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01844419352710247,
"mask/share_reasoning": 0.7916795611381531,
"mask/share_step_conf": 0.1820637583732605,
"num_tokens": 60409704.0,
"reward": 0.7222098112106323,
"reward_std": 0.1835189312696457,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7768715023994446,
"rewards/format_reward_step": 1.0,
"rewards/step_correlation_reward": 0.326923131942749,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.7512425184249878,
"adv/mean_abs_reasoning": 0.3248511850833893,
"adv/mean_abs_step_conf": 0.7509489059448242,
"adv/ratio_final_to_reasoning": 2.31257435072045,
"adv/ratio_step_to_reasoning": 2.3116705138448417,
"adv/std_final_conf": 0.9297266602516174,
"adv/std_reasoning": 0.6185224056243896,
"adv/std_step_conf": 0.9359630346298218,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.57421875,
"calib/ece": 0.07638888888888887,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.025153632280601745,
"calib/mean_conf": 0.6701190476190476,
"calib/mu_c": 0.6618343195266272,
"calib/mu_w": 0.6869879518072289,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03793650793650793,
"calib/std_conf": 0.06463255201556993,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5865831435079727,
"calib/step_q_c_n": 2195.0,
"calib/step_q_gap": -0.0378113877420273,
"calib/step_q_w": 0.62439453125,
"calib/step_q_w_n": 1536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 881.12109375,
"completions/mean_terminated_length": 895.107177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.208,
"grad_norm": 0.18600162863731384,
"kl": 0.053314208984375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0785,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018634159117937088,
"mask/share_reasoning": 0.7847827672958374,
"mask/share_step_conf": 0.18095804750919342,
"num_tokens": 60741255.0,
"reward": 0.6997027397155762,
"reward_std": 0.19091691076755524,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7518917918205261,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3186074495315552,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.74583899974823,
"adv/mean_abs_reasoning": 0.23860622942447662,
"adv/mean_abs_step_conf": 0.760269820690155,
"adv/ratio_final_to_reasoning": 3.1258152880048846,
"adv/ratio_step_to_reasoning": 3.186294936741351,
"adv/std_final_conf": 0.9270501136779785,
"adv/std_reasoning": 0.5227808356285095,
"adv/std_step_conf": 0.9355534315109253,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.80859375,
"calib/ece": 0.094,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.006836344151867846,
"calib/mean_conf": 0.6485882352941176,
"calib/mu_c": 0.6466847826086957,
"calib/mu_w": 0.6535211267605635,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010509803921568622,
"calib/std_conf": 0.03978825152769234,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5722817836812145,
"calib/step_q_c_n": 2108.0,
"calib/step_q_gap": -0.007597997739550388,
"calib/step_q_w": 0.5798797814207649,
"calib/step_q_w_n": 915.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1885.0,
"completions/max_terminated_length": 1885.0,
"completions/mean_length": 740.484375,
"completions/mean_terminated_length": 746.31494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.15738339722156525,
"kl": 0.0513763427734375,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0058,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020728375762701035,
"mask/share_reasoning": 0.7827023267745972,
"mask/share_step_conf": 0.1887567937374115,
"num_tokens": 61033363.0,
"reward": 0.7170915603637695,
"reward_std": 0.20412716269493103,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7863527536392212,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.30486172437667847,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.7726929187774658,
"adv/mean_abs_reasoning": 0.4176172614097595,
"adv/mean_abs_step_conf": 0.7945266962051392,
"adv/ratio_final_to_reasoning": 1.8502418127284055,
"adv/ratio_step_to_reasoning": 1.9025236014503768,
"adv/std_final_conf": 0.9308398365974426,
"adv/std_reasoning": 0.6816394925117493,
"adv/std_step_conf": 0.9358987212181091,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.921875,
"calib/ece": 0.09376470588235293,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.01872598162071848,
"calib/mean_conf": 0.6648235294117646,
"calib/mu_c": 0.6586549707602339,
"calib/mu_w": 0.6773809523809524,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.044,
"calib/std_conf": 0.060149065118271595,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5869473684210527,
"calib/step_q_c_n": 2280.0,
"calib/step_q_gap": -0.014867273323495689,
"calib/step_q_w": 0.6018146417445484,
"calib/step_q_w_n": 1284.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2454.0,
"completions/max_terminated_length": 2454.0,
"completions/mean_length": 862.703125,
"completions/mean_terminated_length": 869.4960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 333.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.20018146932125092,
"kl": 0.047637939453125,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0118,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0184207484126091,
"mask/share_reasoning": 0.7925665378570557,
"mask/share_step_conf": 0.18120017647743225,
"num_tokens": 61359271.0,
"reward": 0.7343250513076782,
"reward_std": 0.2549676299095154,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7641792893409729,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.3716582655906677,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.7752957344055176,
"adv/mean_abs_reasoning": 0.3700793981552124,
"adv/mean_abs_step_conf": 0.7856580018997192,
"adv/ratio_final_to_reasoning": 2.0949443234890808,
"adv/ratio_step_to_reasoning": 2.122944443317031,
"adv/std_final_conf": 0.9272284507751465,
"adv/std_reasoning": 0.6402872800827026,
"adv/std_step_conf": 0.936040997505188,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.84765625,
"calib/ece": 0.09941176470588234,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0008826358826359293,
"calib/mean_conf": 0.6552549019607842,
"calib/mu_c": 0.655026455026455,
"calib/mu_w": 0.6559090909090909,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.006745098039215728,
"calib/std_conf": 0.053466442888848206,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5803194619588062,
"calib/step_q_c_n": 2379.0,
"calib/step_q_gap": -0.00581240617306189,
"calib/step_q_w": 0.5861318681318681,
"calib/step_q_w_n": 910.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2314.0,
"completions/max_terminated_length": 2314.0,
"completions/mean_length": 808.09375,
"completions/mean_terminated_length": 814.4566650390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.2112,
"grad_norm": 0.1991894245147705,
"kl": 0.04837799072265625,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0257,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020755808800458908,
"mask/share_reasoning": 0.7868603467941284,
"mask/share_step_conf": 0.18457132577896118,
"num_tokens": 61671527.0,
"reward": 0.7754865884780884,
"reward_std": 0.22402167320251465,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7944706678390503,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.40962737798690796,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.7386778593063354,
"adv/mean_abs_reasoning": 0.3966127038002014,
"adv/mean_abs_step_conf": 0.7696336507797241,
"adv/ratio_final_to_reasoning": 1.8624664621899092,
"adv/ratio_step_to_reasoning": 1.9405168906728631,
"adv/std_final_conf": 0.9307670593261719,
"adv/std_reasoning": 0.6816592216491699,
"adv/std_step_conf": 0.9359711408615112,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.48046875,
"calib/ece": 0.19595238095238102,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.036653708133971286,
"calib/mean_conf": 0.6711111111111111,
"calib/mu_c": 0.6600568181818182,
"calib/mu_w": 0.6967105263157894,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08432539682539683,
"calib/std_conf": 0.06608943221988206,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5846520146520147,
"calib/step_q_c_n": 2184.0,
"calib/step_q_gap": -0.021898893003865383,
"calib/step_q_w": 0.60655090765588,
"calib/step_q_w_n": 1267.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2511.0,
"completions/max_terminated_length": 2511.0,
"completions/mean_length": 898.6640625,
"completions/mean_terminated_length": 912.9286499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 336.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.16708968579769135,
"kl": 0.053680419921875,
"learning_rate": 5.555555555555556e-08,
"loss": -0.0758,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018384799361228943,
"mask/share_reasoning": 0.7842874526977539,
"mask/share_step_conf": 0.18170276284217834,
"num_tokens": 62005785.0,
"reward": 0.7263771891593933,
"reward_std": 0.23604628443717957,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7568007707595825,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.3615786135196686,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.7753833532333374,
"adv/mean_abs_reasoning": 0.21887263655662537,
"adv/mean_abs_step_conf": 0.7630901336669922,
"adv/ratio_final_to_reasoning": 3.5426235340877574,
"adv/ratio_step_to_reasoning": 3.4864574469982696,
"adv/std_final_conf": 0.9273792505264282,
"adv/std_reasoning": 0.4960128366947174,
"adv/std_step_conf": 0.935178816318512,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 14.609375,
"calib/ece": 0.22654618473895588,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.004016064257028112,
"calib/gap": -0.06397883597883591,
"calib/mean_conf": 0.669437751004016,
"calib/mu_c": 0.6540211640211641,
"calib/mu_w": 0.718,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06847389558232936,
"calib/std_conf": 0.06724412412605409,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5762603757099171,
"calib/step_q_c_n": 2289.0,
"calib/step_q_gap": -0.07003872835624414,
"calib/step_q_w": 0.6462991040661612,
"calib/step_q_w_n": 1451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2642.0,
"completions/max_terminated_length": 2642.0,
"completions/mean_length": 882.26171875,
"completions/mean_terminated_length": 907.064208984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.14146360754966736,
"kl": 0.046146392822265625,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0666,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01773788221180439,
"mask/share_reasoning": 0.7792631387710571,
"mask/share_step_conf": 0.17565517127513885,
"num_tokens": 62339692.0,
"reward": 0.7361253499984741,
"reward_std": 0.14766019582748413,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7597870826721191,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.3702760934829712,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.03823093850282021,
"train_runtime": 13687.4621,
"train_samples_per_second": 3.741,
"train_steps_per_second": 0.015
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 62339692,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}