Files
PureRL-1.5B-v7-s2-corr-mask…/trainer_state.json
ModelHub XC 5fdbdd6778 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-corr-maskon-afew
Source: Original Platform
2026-06-04 01:26:18 +08:00

12243 lines
504 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.47760647535324097,
"adv/mean_abs_reasoning": 0.4569147527217865,
"adv/mean_abs_step_conf": 0.7666968107223511,
"adv/ratio_final_to_reasoning": 1.0452857398632815,
"adv/ratio_step_to_reasoning": 1.677986552535741,
"adv/std_final_conf": 0.7227410674095154,
"adv/std_reasoning": 0.7206857204437256,
"adv/std_step_conf": 0.9354395866394043,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5086206896551725,
"calib/avg_num_step_conf": 7.875,
"calib/ece": 0.2888991935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001713264989126051,
"calib/mean_conf": 0.9905120967741936,
"calib/mu_c": 0.9905632183908043,
"calib/mu_w": 0.9903918918918917,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2888991935483871,
"calib/std_conf": 0.0021794159006610276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9119477557027226,
"calib/step_q_c_n": 1359.0,
"calib/step_q_gap": 0.0056311651395566376,
"calib/step_q_w": 0.9063165905631659,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2494.0,
"completions/max_terminated_length": 2494.0,
"completions/mean_length": 755.49609375,
"completions/mean_terminated_length": 776.7349243164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.025545112788677216,
"kl": 0.0005849599838256836,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.084,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018737709149718285,
"mask/share_reasoning": 0.845859944820404,
"mask/share_step_conf": 0.10805858671665192,
"num_tokens": 300991.0,
"reward": 0.7390083074569702,
"reward_std": 0.33157801628112793,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.6851503849029541,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.46317872405052185,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.437887966632843,
"adv/mean_abs_reasoning": 0.4207462966442108,
"adv/mean_abs_step_conf": 0.7360875010490417,
"adv/ratio_final_to_reasoning": 1.0407411072310102,
"adv/ratio_step_to_reasoning": 1.7494806417071997,
"adv/std_final_conf": 0.6832791566848755,
"adv/std_reasoning": 0.6817297339439392,
"adv/std_step_conf": 0.9336206316947937,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4872611464968153,
"calib/avg_num_step_conf": 7.6953125,
"calib/ece": 0.36465737051792824,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024481637078155316,
"calib/mean_conf": 0.9901553784860557,
"calib/mu_c": 0.990063694267516,
"calib/mu_w": 0.9903085106382975,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36465737051792824,
"calib/std_conf": 0.001222205307190084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9075405636208369,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": -0.003804868168900244,
"calib/step_q_w": 0.9113454317897371,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 840.640625,
"completions/mean_terminated_length": 850.6087036132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 466.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.023035092279314995,
"kl": 0.0016820430755615234,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0122,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01788702979683876,
"mask/share_reasoning": 0.8706268668174744,
"mask/share_step_conf": 0.09976735711097717,
"num_tokens": 619483.0,
"reward": 0.6397823095321655,
"reward_std": 0.3136184811592102,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6203019618988037,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3405126631259918,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.4370732605457306,
"adv/mean_abs_reasoning": 0.42643219232559204,
"adv/mean_abs_step_conf": 0.7442405223846436,
"adv/ratio_final_to_reasoning": 1.0249537169370502,
"adv/ratio_step_to_reasoning": 1.745272837695135,
"adv/std_final_conf": 0.7216770648956299,
"adv/std_reasoning": 0.7205691337585449,
"adv/std_step_conf": 0.9328770041465759,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4994155891591789,
"calib/avg_num_step_conf": 7.53125,
"calib/ece": 0.31421999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -3.9812988530729676e-05,
"calib/mean_conf": 0.99022,
"calib/mu_c": 0.9902071005917161,
"calib/mu_w": 0.9902469135802469,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31421999999999994,
"calib/std_conf": 0.0014323407415835114,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9115477145148358,
"calib/step_q_c_n": 1247.0,
"calib/step_q_gap": -0.00036124290072947485,
"calib/step_q_w": 0.9119089574155653,
"calib/step_q_w_n": 681.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2691.0,
"completions/max_terminated_length": 2691.0,
"completions/mean_length": 793.375,
"completions/mean_terminated_length": 809.1793212890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.0032,
"grad_norm": 0.023815318942070007,
"kl": 0.0005033016204833984,
"learning_rate": 7.5e-07,
"loss": -0.0902,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018328379839658737,
"mask/share_reasoning": 0.8629274368286133,
"mask/share_step_conf": 0.09921293705701828,
"num_tokens": 927843.0,
"reward": 0.6633919477462769,
"reward_std": 0.3260177969932556,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6661549806594849,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.3332850933074951,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.39261171221733093,
"adv/mean_abs_reasoning": 0.39031437039375305,
"adv/mean_abs_step_conf": 0.7463158369064331,
"adv/ratio_final_to_reasoning": 1.0058858755860316,
"adv/ratio_step_to_reasoning": 1.9120890582469259,
"adv/std_final_conf": 0.6817169785499573,
"adv/std_reasoning": 0.6815755367279053,
"adv/std_step_conf": 0.9353012442588806,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5027624309392266,
"calib/avg_num_step_conf": 7.88671875,
"calib/ece": 0.2689243027888446,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.524861878469789e-05,
"calib/mean_conf": 0.9900398406374502,
"calib/mu_c": 0.9900552486187845,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2689243027888446,
"calib/std_conf": 0.0006299357888781637,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9102844638949672,
"calib/step_q_c_n": 1371.0,
"calib/step_q_gap": -0.0030951657346623973,
"calib/step_q_w": 0.9133796296296296,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 790.19921875,
"completions/mean_terminated_length": 796.4212646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.02493300288915634,
"kl": 0.0005279183387756348,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0252,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018840719014406204,
"mask/share_reasoning": 0.8683602809906006,
"mask/share_step_conf": 0.10498643666505814,
"num_tokens": 1236302.0,
"reward": 0.7225152254104614,
"reward_std": 0.31173262000083923,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.71240234375,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3951280117034912,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.42183569073677063,
"adv/mean_abs_reasoning": 0.39110061526298523,
"adv/mean_abs_step_conf": 0.7385746836662292,
"adv/ratio_final_to_reasoning": 1.078586108725803,
"adv/ratio_step_to_reasoning": 1.8884518582758922,
"adv/std_final_conf": 0.6998975276947021,
"adv/std_reasoning": 0.6815629005432129,
"adv/std_step_conf": 0.9349147081375122,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4991792929292929,
"calib/avg_num_step_conf": 7.78515625,
"calib/ece": 0.46246031746031757,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.008151515151515132,
"calib/mean_conf": 0.9862698412698413,
"calib/mu_c": 0.9901515151515151,
"calib/mu_w": 0.982,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46246031746031757,
"calib/std_conf": 0.06226841701149678,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9147625354777673,
"calib/step_q_c_n": 1057.0,
"calib/step_q_gap": 0.01945270641793828,
"calib/step_q_w": 0.895309829059829,
"calib/step_q_w_n": 936.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 832.88671875,
"completions/mean_terminated_length": 839.4448852539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.037833891808986664,
"kl": 0.0005629062652587891,
"learning_rate": 1.25e-06,
"loss": 0.0065,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018345676362514496,
"mask/share_reasoning": 0.8695776462554932,
"mask/share_step_conf": 0.10426412522792816,
"num_tokens": 1556209.0,
"reward": 0.44701236486434937,
"reward_std": 0.3320618271827698,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.528497576713562,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.06552702188491821,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.25368258357048035,
"adv/mean_abs_reasoning": 0.24287119507789612,
"adv/mean_abs_step_conf": 0.7538926601409912,
"adv/ratio_final_to_reasoning": 1.0445149063029755,
"adv/ratio_step_to_reasoning": 3.1040842859079896,
"adv/std_final_conf": 0.5490252375602722,
"adv/std_reasoning": 0.5482149720191956,
"adv/std_step_conf": 0.934939980506897,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5114931610942249,
"calib/avg_num_step_conf": 8.203125,
"calib/ece": 0.43260869565217386,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004989868287736954,
"calib/mean_conf": 0.9899209486166007,
"calib/mu_c": 0.9901418439716309,
"calib/mu_w": 0.9896428571428572,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43260869565217386,
"calib/std_conf": 0.002666153668622935,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.912532637075718,
"calib/step_q_c_n": 1149.0,
"calib/step_q_gap": 0.00033495043008180403,
"calib/step_q_w": 0.9121976866456362,
"calib/step_q_w_n": 951.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1500.0,
"completions/max_terminated_length": 1500.0,
"completions/mean_length": 734.64453125,
"completions/mean_terminated_length": 743.3557739257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 461.0,
"epoch": 0.0064,
"grad_norm": 0.02843756601214409,
"kl": 0.0022296905517578125,
"learning_rate": 1.5e-06,
"loss": -0.0273,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019643045961856842,
"mask/share_reasoning": 0.8531935214996338,
"mask/share_step_conf": 0.11544472724199295,
"num_tokens": 1850230.0,
"reward": 0.516380786895752,
"reward_std": 0.23685571551322937,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5597362518310547,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.1652126908302307,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.4957495331764221,
"adv/mean_abs_reasoning": 0.46634334325790405,
"adv/mean_abs_step_conf": 0.7799911499023438,
"adv/ratio_final_to_reasoning": 1.0630569522298412,
"adv/ratio_step_to_reasoning": 1.6725684223415227,
"adv/std_final_conf": 0.7390549778938293,
"adv/std_reasoning": 0.7207476496696472,
"adv/std_step_conf": 0.9354566335678101,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.49412020905923343,
"calib/avg_num_step_conf": 7.52734375,
"calib/ece": 0.32886693548387114,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00012224157955864623,
"calib/mean_conf": 0.9901572580645163,
"calib/mu_c": 0.9901158536585364,
"calib/mu_w": 0.990238095238095,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32886693548387114,
"calib/std_conf": 0.0012294552548691184,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9107335473515248,
"calib/step_q_c_n": 1246.0,
"calib/step_q_gap": 0.004154986411730399,
"calib/step_q_w": 0.9065785609397944,
"calib/step_q_w_n": 681.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 839.84765625,
"completions/mean_terminated_length": 849.8063354492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.03482392057776451,
"kl": 0.000522613525390625,
"learning_rate": 1.75e-06,
"loss": -0.0187,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.017592590302228928,
"mask/share_reasoning": 0.8731827139854431,
"mask/share_step_conf": 0.09750597178936005,
"num_tokens": 2172655.0,
"reward": 0.6363974809646606,
"reward_std": 0.3690088987350464,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6468581557273865,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.304843008518219,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.42081981897354126,
"adv/mean_abs_reasoning": 0.404765248298645,
"adv/mean_abs_step_conf": 0.7461205124855042,
"adv/ratio_final_to_reasoning": 1.0396639057883024,
"adv/ratio_step_to_reasoning": 1.843341333332548,
"adv/std_final_conf": 0.7008998990058899,
"adv/std_reasoning": 0.701344907283783,
"adv/std_step_conf": 0.9360982775688171,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5083723137414412,
"calib/avg_num_step_conf": 7.7265625,
"calib/ece": 0.3893951612903227,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00016744627482878638,
"calib/mean_conf": 0.990201612903226,
"calib/mu_c": 0.9902684563758388,
"calib/mu_w": 0.99010101010101,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3893951612903227,
"calib/std_conf": 0.0014055181498333381,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9123884514435695,
"calib/step_q_c_n": 1143.0,
"calib/step_q_gap": 0.013969289766922688,
"calib/step_q_w": 0.8984191616766468,
"calib/step_q_w_n": 835.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2611.0,
"completions/max_terminated_length": 2611.0,
"completions/mean_length": 851.25,
"completions/mean_terminated_length": 861.3439331054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.024893108755350113,
"kl": 0.0005701184272766113,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0264,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01763007789850235,
"mask/share_reasoning": 0.872861385345459,
"mask/share_step_conf": 0.09778980910778046,
"num_tokens": 2497087.0,
"reward": 0.5612791776657104,
"reward_std": 0.3177064061164856,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5895148515701294,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.22366851568222046,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.5682142972946167,
"adv/mean_abs_reasoning": 0.5364242792129517,
"adv/mean_abs_step_conf": 0.7655105590820312,
"adv/ratio_final_to_reasoning": 1.0592628248078326,
"adv/ratio_step_to_reasoning": 1.4270617284609075,
"adv/std_final_conf": 0.7919481992721558,
"adv/std_reasoning": 0.7756170630455017,
"adv/std_step_conf": 0.9360333681106567,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.4805194805194805,
"calib/avg_num_step_conf": 7.8125,
"calib/ece": 0.31365546218487383,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00038961038961016214,
"calib/mean_conf": 0.990126050420168,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.99038961038961,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31365546218487383,
"calib/std_conf": 0.0011156233653236776,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9011468970934798,
"calib/step_q_c_n": 1273.0,
"calib/step_q_gap": -0.007331782411334364,
"calib/step_q_w": 0.9084786795048142,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2894.0,
"completions/max_terminated_length": 2894.0,
"completions/mean_length": 811.17578125,
"completions/mean_terminated_length": 840.7327880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.0096,
"grad_norm": 0.018573205918073654,
"kl": 0.0006935596466064453,
"learning_rate": 2.25e-06,
"loss": -0.1026,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.01758037693798542,
"mask/share_reasoning": 0.8479781150817871,
"mask/share_step_conf": 0.09928528964519501,
"num_tokens": 2812284.0,
"reward": 0.6472536325454712,
"reward_std": 0.39420419931411743,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6345956921577454,
"rewards/format_reward_step": 0.9296875,
"rewards/step_correlation_reward": 0.34819287061691284,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.5329347848892212,
"adv/mean_abs_reasoning": 0.5287434458732605,
"adv/mean_abs_step_conf": 0.7661516070365906,
"adv/ratio_final_to_reasoning": 1.0079269805586684,
"adv/ratio_step_to_reasoning": 1.449004452000785,
"adv/std_final_conf": 0.7758122086524963,
"adv/std_reasoning": 0.7754489779472351,
"adv/std_step_conf": 0.9358128905296326,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4855769230769231,
"calib/avg_num_step_conf": 7.62890625,
"calib/ece": 0.40777510040160647,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024999999999997247,
"calib/mean_conf": 0.9901044176706828,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9902500000000001,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40777510040160647,
"calib/std_conf": 0.0009554013445488028,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9091376146788991,
"calib/step_q_c_n": 1090.0,
"calib/step_q_gap": 0.0028560387808688192,
"calib/step_q_w": 0.9062815758980303,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2905.0,
"completions/max_terminated_length": 2905.0,
"completions/mean_length": 834.94140625,
"completions/mean_terminated_length": 851.57373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.023835062980651855,
"kl": 0.0006435513496398926,
"learning_rate": 2.5e-06,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01767464354634285,
"mask/share_reasoning": 0.8661353588104248,
"mask/share_step_conf": 0.09665870666503906,
"num_tokens": 3132829.0,
"reward": 0.5643854737281799,
"reward_std": 0.4170411229133606,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5742319822311401,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.24672642350196838,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.4380926489830017,
"adv/mean_abs_reasoning": 0.37450408935546875,
"adv/mean_abs_step_conf": 0.7518080472946167,
"adv/ratio_final_to_reasoning": 1.169794032788722,
"adv/ratio_step_to_reasoning": 2.007476202966162,
"adv/std_final_conf": 0.7370259165763855,
"adv/std_reasoning": 0.7013659477233887,
"adv/std_step_conf": 0.9356460571289062,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5035284350352844,
"calib/avg_num_step_conf": 7.5546875,
"calib/ece": 0.39430612244897956,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.770582537709899e-05,
"calib/mean_conf": 0.9902244897959184,
"calib/mu_c": 0.9902397260273972,
"calib/mu_w": 0.9902020202020201,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39430612244897956,
"calib/std_conf": 0.001446534297687295,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.912566371681416,
"calib/step_q_c_n": 1130.0,
"calib/step_q_gap": 0.001235525910271651,
"calib/step_q_w": 0.9113308457711443,
"calib/step_q_w_n": 804.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 819.5546875,
"completions/mean_terminated_length": 842.5943603515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.02992285043001175,
"kl": 0.0006269216537475586,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0341,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017518922686576843,
"mask/share_reasoning": 0.8593878746032715,
"mask/share_step_conf": 0.09574948251247406,
"num_tokens": 3447115.0,
"reward": 0.5951333045959473,
"reward_std": 0.29580140113830566,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5777971744537354,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.3070007264614105,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.42120277881622314,
"adv/mean_abs_reasoning": 0.39798927307128906,
"adv/mean_abs_step_conf": 0.7613787651062012,
"adv/ratio_final_to_reasoning": 1.05832696335204,
"adv/ratio_step_to_reasoning": 1.9130635336742372,
"adv/std_final_conf": 0.682644784450531,
"adv/std_reasoning": 0.6817029714584351,
"adv/std_step_conf": 0.9329316020011902,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.511436658506732,
"calib/avg_num_step_conf": 7.9453125,
"calib/ece": 0.2926572580645159,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9959677419354839,
"calib/gap": 0.013121481028151805,
"calib/mean_conf": 0.9862056451612904,
"calib/mu_c": 0.9902267441860464,
"calib/mu_w": 0.9771052631578946,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2926572580645159,
"calib/std_conf": 0.06276590249904203,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9126993416239942,
"calib/step_q_c_n": 1367.0,
"calib/step_q_gap": 0.0034339743076523543,
"calib/step_q_w": 0.9092653673163419,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1637.0,
"completions/max_terminated_length": 1637.0,
"completions/mean_length": 754.33984375,
"completions/mean_terminated_length": 778.67333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.0128,
"grad_norm": 0.035631634294986725,
"kl": 0.0007064938545227051,
"learning_rate": 3e-06,
"loss": -0.0519,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01888711005449295,
"mask/share_reasoning": 0.8392666578292847,
"mask/share_step_conf": 0.11059625446796417,
"num_tokens": 3744402.0,
"reward": 0.7010163068771362,
"reward_std": 0.2875725030899048,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6814679503440857,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.392439603805542,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.485071063041687,
"adv/mean_abs_reasoning": 0.46181046962738037,
"adv/mean_abs_step_conf": 0.7558674812316895,
"adv/ratio_final_to_reasoning": 1.0503682678157444,
"adv/ratio_step_to_reasoning": 1.6367482570102276,
"adv/std_final_conf": 0.7590910792350769,
"adv/std_reasoning": 0.7392831444740295,
"adv/std_step_conf": 0.9358668327331543,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5147058823529412,
"calib/avg_num_step_conf": 7.84765625,
"calib/ece": 0.3279681274900399,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.002177888022678931,
"calib/mean_conf": 0.9893227091633466,
"calib/mu_c": 0.9900602409638553,
"calib/mu_w": 0.9878823529411763,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3279681274900399,
"calib/std_conf": 0.008030038366434497,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9123665338645418,
"calib/step_q_c_n": 1255.0,
"calib/step_q_gap": 0.007976613440138669,
"calib/step_q_w": 0.9043899204244031,
"calib/step_q_w_n": 754.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1690.0,
"completions/max_terminated_length": 1690.0,
"completions/mean_length": 787.69140625,
"completions/mean_terminated_length": 800.1945190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.03413955122232437,
"kl": 0.0016703009605407715,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0048,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018558364361524582,
"mask/share_reasoning": 0.8621881008148193,
"mask/share_step_conf": 0.10362851619720459,
"num_tokens": 4050643.0,
"reward": 0.6856704354286194,
"reward_std": 0.37686243653297424,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6563093662261963,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3892502188682556,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.5435507893562317,
"adv/mean_abs_reasoning": 0.5240600109100342,
"adv/mean_abs_step_conf": 0.7819471955299377,
"adv/ratio_final_to_reasoning": 1.0371918826860145,
"adv/ratio_step_to_reasoning": 1.4920947587129965,
"adv/std_final_conf": 0.7767627835273743,
"adv/std_reasoning": 0.7755653858184814,
"adv/std_step_conf": 0.9358567595481873,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5077658303464754,
"calib/avg_num_step_conf": 7.19140625,
"calib/ece": 0.36823983739837396,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001515918195236221,
"calib/mean_conf": 0.9901910569105691,
"calib/mu_c": 0.9902483660130718,
"calib/mu_w": 0.9900967741935481,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36823983739837396,
"calib/std_conf": 0.0013282734150218745,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9124896193771628,
"calib/step_q_c_n": 1156.0,
"calib/step_q_gap": 0.006091079231177354,
"calib/step_q_w": 0.9063985401459854,
"calib/step_q_w_n": 685.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 847.62109375,
"completions/mean_terminated_length": 867.9640502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.02671867609024048,
"kl": 0.011602401733398438,
"learning_rate": 3.5e-06,
"loss": -0.0298,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.017643479630351067,
"mask/share_reasoning": 0.8625574707984924,
"mask/share_step_conf": 0.09636152535676956,
"num_tokens": 4373034.0,
"reward": 0.5819574594497681,
"reward_std": 0.41870835423469543,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6047573685646057,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.2474387288093567,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.3743641972541809,
"adv/mean_abs_reasoning": 0.350879430770874,
"adv/mean_abs_step_conf": 0.7743525505065918,
"adv/ratio_final_to_reasoning": 1.066931157610782,
"adv/ratio_step_to_reasoning": 2.206890694063648,
"adv/std_final_conf": 0.6611832976341248,
"adv/std_reasoning": 0.6403455138206482,
"adv/std_step_conf": 0.9354344010353088,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4946808510638298,
"calib/avg_num_step_conf": 7.54296875,
"calib/ece": 0.36603600000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -9.574468085082888e-05,
"calib/mean_conf": 0.990036,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900957446808508,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36603600000000003,
"calib/std_conf": 0.0005680704181701424,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9110808510638297,
"calib/step_q_c_n": 1175.0,
"calib/step_q_gap": 0.009850692333671085,
"calib/step_q_w": 0.9012301587301587,
"calib/step_q_w_n": 756.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 751.8203125,
"completions/mean_terminated_length": 766.7968139648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.016,
"grad_norm": 0.044145699590444565,
"kl": 0.0020477771759033203,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0193,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019235068932175636,
"mask/share_reasoning": 0.8573687672615051,
"mask/share_step_conf": 0.1038648933172226,
"num_tokens": 4673380.0,
"reward": 0.6289956569671631,
"reward_std": 0.3001975417137146,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6165511608123779,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.32425254583358765,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.3798687756061554,
"adv/mean_abs_reasoning": 0.36612629890441895,
"adv/mean_abs_step_conf": 0.787256121635437,
"adv/ratio_final_to_reasoning": 1.0375347980815879,
"adv/ratio_step_to_reasoning": 2.150231010422331,
"adv/std_final_conf": 0.6405870914459229,
"adv/std_reasoning": 0.6404175758361816,
"adv/std_step_conf": 0.9351932406425476,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.48214285714285715,
"calib/avg_num_step_conf": 7.640625,
"calib/ece": 0.32882258064516146,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0003333333333332966,
"calib/mean_conf": 0.9901129032258066,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9903333333333332,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32882258064516146,
"calib/std_conf": 0.0010216185562653634,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9119055374592835,
"calib/step_q_c_n": 1228.0,
"calib/step_q_gap": 0.0032544385581845248,
"calib/step_q_w": 0.908651098901099,
"calib/step_q_w_n": 728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2059.0,
"completions/max_terminated_length": 2059.0,
"completions/mean_length": 875.140625,
"completions/mean_terminated_length": 892.57373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 491.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.017418626695871353,
"kl": 0.0017644166946411133,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0353,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.016430053859949112,
"mask/share_reasoning": 0.8740938901901245,
"mask/share_step_conf": 0.08994480222463608,
"num_tokens": 5006264.0,
"reward": 0.617144763469696,
"reward_std": 0.2961219847202301,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6468729972839355,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.2655414640903473,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.4859449863433838,
"adv/mean_abs_reasoning": 0.4735613763332367,
"adv/mean_abs_step_conf": 0.7723269462585449,
"adv/ratio_final_to_reasoning": 1.026149957807017,
"adv/ratio_step_to_reasoning": 1.6308909147925785,
"adv/std_final_conf": 0.741020143032074,
"adv/std_reasoning": 0.7394781708717346,
"adv/std_step_conf": 0.9351745843887329,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4934108527131783,
"calib/avg_num_step_conf": 7.69921875,
"calib/ece": 0.2933481781376518,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9959514170040485,
"calib/gap": -0.0005967441860463829,
"calib/mean_conf": 0.9897044534412955,
"calib/mu_c": 0.9895232558139534,
"calib/mu_w": 0.9901199999999998,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2933481781376518,
"calib/std_conf": 0.005839078563134405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.908682059046177,
"calib/step_q_c_n": 1321.0,
"calib/step_q_gap": -0.004825633261515194,
"calib/step_q_w": 0.9135076923076922,
"calib/step_q_w_n": 650.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1491.0,
"completions/max_terminated_length": 1491.0,
"completions/mean_length": 784.41015625,
"completions/mean_terminated_length": 809.7136840820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.019946565851569176,
"kl": 0.0026388168334960938,
"learning_rate": 4.25e-06,
"loss": -0.0894,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017837045714259148,
"mask/share_reasoning": 0.8502132892608643,
"mask/share_step_conf": 0.10069967806339264,
"num_tokens": 5310601.0,
"reward": 0.7013490200042725,
"reward_std": 0.37124860286712646,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6775288581848145,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.3978252708911896,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.4173857271671295,
"adv/mean_abs_reasoning": 0.41300535202026367,
"adv/mean_abs_step_conf": 0.7370244264602661,
"adv/ratio_final_to_reasoning": 1.0106060977792146,
"adv/ratio_step_to_reasoning": 1.7845396502854636,
"adv/std_final_conf": 0.7005000114440918,
"adv/std_reasoning": 0.7013863921165466,
"adv/std_step_conf": 0.9352988004684448,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5036764705882353,
"calib/avg_num_step_conf": 7.109375,
"calib/ece": 0.434934693877551,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.617647058837495e-05,
"calib/mean_conf": 0.9900367346938775,
"calib/mu_c": 0.9900661764705883,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.434934693877551,
"calib/std_conf": 0.0005738142619033464,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.912990099009901,
"calib/step_q_c_n": 1010.0,
"calib/step_q_gap": 0.008213555800024586,
"calib/step_q_w": 0.9047765432098764,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2769.0,
"completions/max_terminated_length": 2769.0,
"completions/mean_length": 828.5,
"completions/mean_terminated_length": 855.2257690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.0192,
"grad_norm": 0.019809581339359283,
"kl": 0.004322528839111328,
"learning_rate": 4.5e-06,
"loss": -0.033,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017656367272138596,
"mask/share_reasoning": 0.8579134941101074,
"mask/share_step_conf": 0.09318015724420547,
"num_tokens": 5633417.0,
"reward": 0.48591434955596924,
"reward_std": 0.3093951344490051,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5396703481674194,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.1345020830631256,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.41409170627593994,
"adv/mean_abs_reasoning": 0.3675980269908905,
"adv/mean_abs_step_conf": 0.7662495374679565,
"adv/ratio_final_to_reasoning": 1.126479675817742,
"adv/ratio_step_to_reasoning": 2.0844767414568985,
"adv/std_final_conf": 0.70262211561203,
"adv/std_reasoning": 0.6612752079963684,
"adv/std_step_conf": 0.9340653419494629,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.48236096673596673,
"calib/avg_num_step_conf": 7.51171875,
"calib/ece": 0.4024480158730158,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.0004430093555092274,
"calib/mean_conf": 0.9897496031746031,
"calib/mu_c": 0.9899324324324323,
"calib/mu_w": 0.9894894230769231,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4024480158730158,
"calib/std_conf": 0.005816373290567585,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.911044104410441,
"calib/step_q_c_n": 1111.0,
"calib/step_q_gap": 0.0006953359375346801,
"calib/step_q_w": 0.9103487684729064,
"calib/step_q_w_n": 812.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 795.19921875,
"completions/mean_terminated_length": 801.4606323242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 485.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.024985479190945625,
"kl": 0.004532575607299805,
"learning_rate": 4.75e-06,
"loss": -0.0252,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018527554348111153,
"mask/share_reasoning": 0.8725011348724365,
"mask/share_step_conf": 0.10115884244441986,
"num_tokens": 5941748.0,
"reward": 0.581405520439148,
"reward_std": 0.3006322979927063,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5865280628204346,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.26378291845321655,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.5181697607040405,
"adv/mean_abs_reasoning": 0.5007547736167908,
"adv/mean_abs_step_conf": 0.7679487466812134,
"adv/ratio_final_to_reasoning": 1.0347774759319155,
"adv/ratio_step_to_reasoning": 1.5335824781750285,
"adv/std_final_conf": 0.7582629323005676,
"adv/std_reasoning": 0.7577385902404785,
"adv/std_step_conf": 0.9357162714004517,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5255102040816326,
"calib/avg_num_step_conf": 7.671875,
"calib/ece": 0.3893760330578513,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9793388429752066,
"calib/gap": 0.013768565759637363,
"calib/mean_conf": 0.9844173553719009,
"calib/mu_c": 0.9899930555555557,
"calib/mu_w": 0.9762244897959184,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3893760330578513,
"calib/std_conf": 0.06444739180417867,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9136850044365572,
"calib/step_q_c_n": 1127.0,
"calib/step_q_gap": 0.008189186037512841,
"calib/step_q_w": 0.9054958183990444,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 795.08203125,
"completions/mean_terminated_length": 824.0526733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 469.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.023194918408989906,
"kl": 0.005844593048095703,
"learning_rate": 5e-06,
"loss": -0.0762,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.01830507069826126,
"mask/share_reasoning": 0.8429714441299438,
"mask/share_step_conf": 0.1035672202706337,
"num_tokens": 6250161.0,
"reward": 0.5385861992835999,
"reward_std": 0.3656718134880066,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5765472650527954,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.19906258583068848,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.5136041045188904,
"adv/mean_abs_reasoning": 0.5023784637451172,
"adv/mean_abs_step_conf": 0.7856019735336304,
"adv/ratio_final_to_reasoning": 1.0223449880595767,
"adv/ratio_step_to_reasoning": 1.5637652292599216,
"adv/std_final_conf": 0.7410447597503662,
"adv/std_reasoning": 0.7393964529037476,
"adv/std_step_conf": 0.9348511099815369,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5060240963855421,
"calib/avg_num_step_conf": 7.796875,
"calib/ece": 0.3339328063241106,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 9.036144578344896e-05,
"calib/mean_conf": 0.9900592885375493,
"calib/mu_c": 0.9900903614457831,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3339328063241106,
"calib/std_conf": 0.0007003970413703307,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9135787401574804,
"calib/step_q_c_n": 1270.0,
"calib/step_q_gap": 0.0030828723888853826,
"calib/step_q_w": 0.910495867768595,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1549.0,
"completions/max_terminated_length": 1549.0,
"completions/mean_length": 786.19921875,
"completions/mean_terminated_length": 795.5217895507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 369.0,
"epoch": 0.0224,
"grad_norm": 0.014077394269406796,
"kl": 0.0046427249908447266,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0203,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.018851812928915024,
"mask/share_reasoning": 0.8657242059707642,
"mask/share_step_conf": 0.10370523482561111,
"num_tokens": 6554388.0,
"reward": 0.6516726016998291,
"reward_std": 0.41838204860687256,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.655136227607727,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.32086512446403503,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.38115063309669495,
"adv/mean_abs_reasoning": 0.37618327140808105,
"adv/mean_abs_step_conf": 0.7508043050765991,
"adv/ratio_final_to_reasoning": 1.0132046320667603,
"adv/ratio_step_to_reasoning": 1.9958471365998933,
"adv/std_final_conf": 0.6787269711494446,
"adv/std_reasoning": 0.6815654039382935,
"adv/std_step_conf": 0.935329794883728,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.80859375,
"calib/ece": 0.34080645161290335,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34080645161290335,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9151821862348178,
"calib/step_q_c_n": 1235.0,
"calib/step_q_gap": -0.004241897534815742,
"calib/step_q_w": 0.9194240837696336,
"calib/step_q_w_n": 764.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2856.0,
"completions/max_terminated_length": 2856.0,
"completions/mean_length": 779.8046875,
"completions/mean_terminated_length": 795.3386840820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.15635307133197784,
"kl": 0.022773265838623047,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0687,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01855219155550003,
"mask/share_reasoning": 0.8577111959457397,
"mask/share_step_conf": 0.10420538485050201,
"num_tokens": 6855834.0,
"reward": 0.6378905773162842,
"reward_std": 0.28502583503723145,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6356062293052673,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.32064372301101685,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.47258201241493225,
"adv/mean_abs_reasoning": 0.45605725049972534,
"adv/mean_abs_step_conf": 0.7994668483734131,
"adv/ratio_final_to_reasoning": 1.0362339638216471,
"adv/ratio_step_to_reasoning": 1.7529966851692331,
"adv/std_final_conf": 0.7200669646263123,
"adv/std_reasoning": 0.720693826675415,
"adv/std_step_conf": 0.9347937107086182,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.55078125,
"calib/ece": 0.41799600000000015,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 9.345794392379148e-06,
"calib/mean_conf": 0.9899960000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899906542056074,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.41799600000000015,
"calib/std_conf": 0.0008508724933854668,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9142924086223055,
"calib/step_q_c_n": 1067.0,
"calib/step_q_gap": -0.0032410786756159604,
"calib/step_q_w": 0.9175334872979215,
"calib/step_q_w_n": 866.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2410.0,
"completions/max_terminated_length": 2410.0,
"completions/mean_length": 800.8828125,
"completions/mean_terminated_length": 816.836669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.01701655611395836,
"kl": 0.007472038269042969,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0472,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01825655996799469,
"mask/share_reasoning": 0.8625786900520325,
"mask/share_step_conf": 0.09963353723287582,
"num_tokens": 7164796.0,
"reward": 0.5432964563369751,
"reward_std": 0.36795392632484436,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5667847394943237,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.2135581225156784,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.5871478319168091,
"adv/mean_abs_reasoning": 0.552639365196228,
"adv/mean_abs_step_conf": 0.7762830257415771,
"adv/ratio_final_to_reasoning": 1.0624430123763045,
"adv/ratio_step_to_reasoning": 1.404682826866557,
"adv/std_final_conf": 0.8089284300804138,
"adv/std_reasoning": 0.792900562286377,
"adv/std_step_conf": 0.9361394047737122,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5113929201139291,
"calib/avg_num_step_conf": 7.4375,
"calib/ece": 0.3977732793522267,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9878542510121457,
"calib/gap": 0.0012647497626475657,
"calib/mean_conf": 0.9888663967611336,
"calib/mu_c": 0.9893835616438356,
"calib/mu_w": 0.988118811881188,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3977732793522267,
"calib/std_conf": 0.009874227503863084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.914830659536542,
"calib/step_q_c_n": 1122.0,
"calib/step_q_gap": 0.005738587925288696,
"calib/step_q_w": 0.9090920716112533,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2658.0,
"completions/max_terminated_length": 2658.0,
"completions/mean_length": 843.9375,
"completions/mean_terminated_length": 860.7490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.0256,
"grad_norm": 0.030872756615281105,
"kl": 0.007044792175292969,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0169,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01767241582274437,
"mask/share_reasoning": 0.8637890815734863,
"mask/share_step_conf": 0.09900720417499542,
"num_tokens": 7485356.0,
"reward": 0.5760669708251953,
"reward_std": 0.45210501551628113,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5794737935066223,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.26562875509262085,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.348545640707016,
"adv/mean_abs_reasoning": 0.3495147228240967,
"adv/mean_abs_step_conf": 0.7539834380149841,
"adv/ratio_final_to_reasoning": 0.9972273496542564,
"adv/ratio_step_to_reasoning": 2.1572294063115844,
"adv/std_final_conf": 0.6166334748268127,
"adv/std_reasoning": 0.6185421347618103,
"adv/std_step_conf": 0.9329771995544434,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.2578125,
"calib/ece": 0.34856573705179283,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34856573705179283,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9151369863013697,
"calib/step_q_c_n": 1168.0,
"calib/step_q_gap": 0.0017456819535436319,
"calib/step_q_w": 0.9133913043478261,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 785.921875,
"completions/mean_terminated_length": 789.0039672851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 358.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02121826820075512,
"kl": 0.007907867431640625,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0238,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01904113218188286,
"mask/share_reasoning": 0.876044750213623,
"mask/share_step_conf": 0.10100783407688141,
"num_tokens": 7789776.0,
"reward": 0.6249831318855286,
"reward_std": 0.2563919126987457,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6358394622802734,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.2922517657279968,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.4268883466720581,
"adv/mean_abs_reasoning": 0.41854697465896606,
"adv/mean_abs_step_conf": 0.7563217878341675,
"adv/ratio_final_to_reasoning": 1.0199293568419379,
"adv/ratio_step_to_reasoning": 1.8070176912649334,
"adv/std_final_conf": 0.7015519142150879,
"adv/std_reasoning": 0.7013825178146362,
"adv/std_step_conf": 0.9330384135246277,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.1171875,
"calib/ece": 0.34317460317460324,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34317460317460324,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9113718723037103,
"calib/step_q_c_n": 1159.0,
"calib/step_q_gap": -0.0010564836540575273,
"calib/step_q_w": 0.9124283559577678,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 754.95703125,
"completions/mean_terminated_length": 760.9015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.01634961925446987,
"kl": 0.010264396667480469,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019156184047460556,
"mask/share_reasoning": 0.8739191293716431,
"mask/share_step_conf": 0.0991121456027031,
"num_tokens": 8088285.0,
"reward": 0.667913019657135,
"reward_std": 0.2968505620956421,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6435734033584595,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.36803382635116577,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.5693494081497192,
"adv/mean_abs_reasoning": 0.5429830551147461,
"adv/mean_abs_step_conf": 0.7924162745475769,
"adv/ratio_final_to_reasoning": 1.048558334899422,
"adv/ratio_step_to_reasoning": 1.4593756970558118,
"adv/std_final_conf": 0.7916540503501892,
"adv/std_reasoning": 0.7754848599433899,
"adv/std_step_conf": 0.9357017278671265,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49655720338983045,
"calib/avg_num_step_conf": 7.5390625,
"calib/ece": 0.4689837398373984,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9959349593495935,
"calib/gap": -0.0010023834745759297,
"calib/mean_conf": 0.989308943089431,
"calib/mu_c": 0.9888281250000002,
"calib/mu_w": 0.9898305084745761,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4689837398373984,
"calib/std_conf": 0.006370192376644552,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9138975966562173,
"calib/step_q_c_n": 957.0,
"calib/step_q_gap": -0.001621416704522649,
"calib/step_q_w": 0.9155190133607399,
"calib/step_q_w_n": 973.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2679.0,
"completions/max_terminated_length": 2679.0,
"completions/mean_length": 775.19140625,
"completions/mean_terminated_length": 796.98388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.0288,
"grad_norm": 0.03222472965717316,
"kl": 0.0107879638671875,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0881,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01808036118745804,
"mask/share_reasoning": 0.8571000099182129,
"mask/share_step_conf": 0.09747587144374847,
"num_tokens": 8391950.0,
"reward": 0.48863452672958374,
"reward_std": 0.4100489318370819,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5092262029647827,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.17585542798042297,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.4156338572502136,
"adv/mean_abs_reasoning": 0.38927507400512695,
"adv/mean_abs_step_conf": 0.7773406505584717,
"adv/ratio_final_to_reasoning": 1.0677124866328829,
"adv/ratio_step_to_reasoning": 1.9968929491443208,
"adv/std_final_conf": 0.6787872910499573,
"adv/std_reasoning": 0.6614362001419067,
"adv/std_step_conf": 0.9353095293045044,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5175358970127084,
"calib/avg_num_step_conf": 6.5,
"calib/ece": 0.2940585774058576,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.9874476987447699,
"calib/gap": 0.00274550255817807,
"calib/mean_conf": 0.9886192468619246,
"calib/mu_c": 0.9894578313253012,
"calib/mu_w": 0.9867123287671231,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2940585774058576,
"calib/std_conf": 0.012177960658197895,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9127568493150684,
"calib/step_q_c_n": 1168.0,
"calib/step_q_gap": 0.007131849315068273,
"calib/step_q_w": 0.9056250000000001,
"calib/step_q_w_n": 496.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2551.0,
"completions/max_terminated_length": 2551.0,
"completions/mean_length": 780.625,
"completions/mean_terminated_length": 793.0159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.030569037422537804,
"kl": 0.013233184814453125,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0525,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.018613245338201523,
"mask/share_reasoning": 0.8756033182144165,
"mask/share_step_conf": 0.09015839546918869,
"num_tokens": 8698734.0,
"reward": 0.661705732345581,
"reward_std": 0.299774169921875,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6507663726806641,
"rewards/format_reward_step": 0.92578125,
"rewards/step_correlation_reward": 0.35780128836631775,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.4315052628517151,
"adv/mean_abs_reasoning": 0.39599186182022095,
"adv/mean_abs_step_conf": 0.7412615418434143,
"adv/ratio_final_to_reasoning": 1.0896821486892503,
"adv/ratio_step_to_reasoning": 1.8719110499799734,
"adv/std_final_conf": 0.7103040218353271,
"adv/std_reasoning": 0.70142662525177,
"adv/std_step_conf": 0.9351839423179626,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.4931972789115646,
"calib/avg_num_step_conf": 6.7265625,
"calib/ece": 0.38733606557377054,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0003401360544218468,
"calib/mean_conf": 0.9897950819672131,
"calib/mu_c": 0.9896598639455781,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38733606557377054,
"calib/std_conf": 0.002299103619304223,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9086032977691563,
"calib/step_q_c_n": 1031.0,
"calib/step_q_gap": 0.00427623554050216,
"calib/step_q_w": 0.9043270622286541,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2496.0,
"completions/max_terminated_length": 2496.0,
"completions/mean_length": 849.328125,
"completions/mean_terminated_length": 869.7120361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.016793156042695045,
"kl": 0.011755943298339844,
"learning_rate": 4.75e-06,
"loss": -0.0486,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.0170335303992033,
"mask/share_reasoning": 0.8732938766479492,
"mask/share_step_conf": 0.0862351506948471,
"num_tokens": 9023290.0,
"reward": 0.5508725643157959,
"reward_std": 0.2956800162792206,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.5816925764083862,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.21458379924297333,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.5231447219848633,
"adv/mean_abs_reasoning": 0.4696976840496063,
"adv/mean_abs_step_conf": 0.7574329376220703,
"adv/ratio_final_to_reasoning": 1.1137902948007983,
"adv/ratio_step_to_reasoning": 1.6125967049521908,
"adv/std_final_conf": 0.778552234172821,
"adv/std_reasoning": 0.7393490076065063,
"adv/std_step_conf": 0.9359943866729736,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5002841716396703,
"calib/avg_num_step_conf": 7.2265625,
"calib/ece": 0.3659959183673469,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9959183673469387,
"calib/gap": -0.000948920147769039,
"calib/mean_conf": 0.9888530612244898,
"calib/mu_c": 0.9884967320261439,
"calib/mu_w": 0.9894456521739129,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.36517959183673465,
"calib/std_conf": 0.01233192475568696,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9107060900264783,
"calib/step_q_c_n": 1133.0,
"calib/step_q_gap": 0.004667038422572989,
"calib/step_q_w": 0.9060390516039053,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2112.0,
"completions/max_terminated_length": 2112.0,
"completions/mean_length": 824.5390625,
"completions/mean_terminated_length": 837.6270141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 454.0,
"epoch": 0.032,
"grad_norm": 0.030233023688197136,
"kl": 0.01520538330078125,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0384,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017376385629177094,
"mask/share_reasoning": 0.8753885626792908,
"mask/share_step_conf": 0.09161008894443512,
"num_tokens": 9341356.0,
"reward": 0.6088252067565918,
"reward_std": 0.36066246032714844,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6049777269363403,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.3017350435256958,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.5415816307067871,
"adv/mean_abs_reasoning": 0.44430142641067505,
"adv/mean_abs_step_conf": 0.7793623208999634,
"adv/ratio_final_to_reasoning": 1.2189509160076257,
"adv/ratio_step_to_reasoning": 1.7541296844264147,
"adv/std_final_conf": 0.7959402203559875,
"adv/std_reasoning": 0.7207141518592834,
"adv/std_step_conf": 0.935605525970459,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5253833950290852,
"calib/avg_num_step_conf": 7.27734375,
"calib/ece": 0.4842276422764228,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.991869918699187,
"calib/gap": 0.0018164992067687402,
"calib/mean_conf": 0.9882926829268293,
"calib/mu_c": 0.9891935483870967,
"calib/mu_w": 0.987377049180328,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4842276422764228,
"calib/std_conf": 0.008853580147024696,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9126297968397293,
"calib/step_q_c_n": 886.0,
"calib/step_q_gap": 0.019088343410865405,
"calib/step_q_w": 0.8935414534288639,
"calib/step_q_w_n": 977.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3005.0,
"completions/max_terminated_length": 3005.0,
"completions/mean_length": 894.51171875,
"completions/mean_terminated_length": 912.3306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.02490750513970852,
"kl": 0.015928268432617188,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0297,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01670205220580101,
"mask/share_reasoning": 0.8775709867477417,
"mask/share_step_conf": 0.08619573712348938,
"num_tokens": 9676263.0,
"reward": 0.450461208820343,
"reward_std": 0.3570416271686554,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.49612146615982056,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.11651960015296936,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.5400800108909607,
"adv/mean_abs_reasoning": 0.45109522342681885,
"adv/mean_abs_step_conf": 0.7773272395133972,
"adv/ratio_final_to_reasoning": 1.1972638654609427,
"adv/ratio_step_to_reasoning": 1.7231998902766101,
"adv/std_final_conf": 0.784846842288971,
"adv/std_reasoning": 0.7394152879714966,
"adv/std_step_conf": 0.9346668124198914,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5321026197909596,
"calib/avg_num_step_conf": 6.65234375,
"calib/ece": 0.4194693877551021,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9877551020408163,
"calib/gap": 0.003201438848920768,
"calib/mean_conf": 0.9868163265306124,
"calib/mu_c": 0.9882014388489208,
"calib/mu_w": 0.985,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4194693877551021,
"calib/std_conf": 0.011835045621921766,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9128676470588235,
"calib/step_q_c_n": 952.0,
"calib/step_q_gap": 0.006036754915015097,
"calib/step_q_w": 0.9068308921438084,
"calib/step_q_w_n": 751.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2759.0,
"completions/max_terminated_length": 2759.0,
"completions/mean_length": 804.30078125,
"completions/mean_terminated_length": 830.2459716796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.046489838510751724,
"kl": 0.021024703979492188,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0532,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017999248579144478,
"mask/share_reasoning": 0.8620076179504395,
"mask/share_step_conf": 0.08874315023422241,
"num_tokens": 9988868.0,
"reward": 0.533851146697998,
"reward_std": 0.3655565083026886,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5549355745315552,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.213547945022583,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.5530176162719727,
"adv/mean_abs_reasoning": 0.4229363203048706,
"adv/mean_abs_step_conf": 0.754909098148346,
"adv/ratio_final_to_reasoning": 1.3075670963263073,
"adv/ratio_step_to_reasoning": 1.7849237861722898,
"adv/std_final_conf": 0.7939488887786865,
"adv/std_reasoning": 0.7013248205184937,
"adv/std_step_conf": 0.9332074522972107,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5538732394366197,
"calib/avg_num_step_conf": 6.875,
"calib/ece": 0.4250396825396827,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.001911651728553232,
"calib/mean_conf": 0.9885317460317462,
"calib/mu_c": 0.9893661971830987,
"calib/mu_w": 0.9874545454545455,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4250396825396827,
"calib/std_conf": 0.00659387222500005,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9126046986721145,
"calib/step_q_c_n": 979.0,
"calib/step_q_gap": 0.006138629529988893,
"calib/step_q_w": 0.9064660691421256,
"calib/step_q_w_n": 781.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 748.5625,
"completions/mean_terminated_length": 760.4444580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.0352,
"grad_norm": 0.09581245481967926,
"kl": 0.022441864013671875,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0162,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019542766734957695,
"mask/share_reasoning": 0.8670825958251953,
"mask/share_step_conf": 0.09774963557720184,
"num_tokens": 10287372.0,
"reward": 0.5962920188903809,
"reward_std": 0.3191433548927307,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5652187466621399,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3203340768814087,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.6945086121559143,
"adv/mean_abs_reasoning": 0.6001378297805786,
"adv/mean_abs_step_conf": 0.7928270101547241,
"adv/ratio_final_to_reasoning": 1.1572485147450868,
"adv/ratio_step_to_reasoning": 1.321074877823643,
"adv/std_final_conf": 0.8546317219734192,
"adv/std_reasoning": 0.8099603652954102,
"adv/std_step_conf": 0.9364176988601685,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.481389252948886,
"calib/avg_num_step_conf": 6.953125,
"calib/ece": 0.4232530120481929,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9839357429718876,
"calib/gap": -4.5871559635202175e-06,
"calib/mean_conf": 0.9855020080321286,
"calib/mu_c": 0.9854999999999998,
"calib/mu_w": 0.9855045871559633,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4232530120481929,
"calib/std_conf": 0.011883343443218902,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9124770642201836,
"calib/step_q_c_n": 981.0,
"calib/step_q_gap": 0.0011754371863912372,
"calib/step_q_w": 0.9113016270337924,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1629.0,
"completions/max_terminated_length": 1629.0,
"completions/mean_length": 747.47265625,
"completions/mean_terminated_length": 753.3582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.03602541610598564,
"kl": 0.029666900634765625,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0109,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01917748898267746,
"mask/share_reasoning": 0.8739994764328003,
"mask/share_step_conf": 0.09901049733161926,
"num_tokens": 10583837.0,
"reward": 0.5894248485565186,
"reward_std": 0.45810455083847046,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5588769316673279,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.31606653332710266,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.5951634645462036,
"adv/mean_abs_reasoning": 0.46525806188583374,
"adv/mean_abs_step_conf": 0.7743443846702576,
"adv/ratio_final_to_reasoning": 1.279211502824526,
"adv/ratio_step_to_reasoning": 1.6643330833034942,
"adv/std_final_conf": 0.8165079951286316,
"adv/std_reasoning": 0.7393738031387329,
"adv/std_step_conf": 0.9352921843528748,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5283114943309117,
"calib/avg_num_step_conf": 7.22265625,
"calib/ece": 0.3974796747967483,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.967479674796748,
"calib/gap": 0.0045827958449314865,
"calib/mean_conf": 0.9787804878048784,
"calib/mu_c": 0.9806993006993008,
"calib/mu_w": 0.9761165048543693,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3974796747967483,
"calib/std_conf": 0.01734796283130076,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9066231155778894,
"calib/step_q_c_n": 995.0,
"calib/step_q_gap": -6.306709189984883e-05,
"calib/step_q_w": 0.9066861826697893,
"calib/step_q_w_n": 854.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2750.0,
"completions/max_terminated_length": 2750.0,
"completions/mean_length": 812.34375,
"completions/mean_terminated_length": 838.54833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 473.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.072185218334198,
"kl": 0.034183502197265625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.1212,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.017634199932217598,
"mask/share_reasoning": 0.8655506372451782,
"mask/share_step_conf": 0.08556517958641052,
"num_tokens": 10901053.0,
"reward": 0.5513752698898315,
"reward_std": 0.343622088432312,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5770906209945679,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.22175362706184387,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.6594531536102295,
"adv/mean_abs_reasoning": 0.3306323289871216,
"adv/mean_abs_step_conf": 0.7618266344070435,
"adv/ratio_final_to_reasoning": 1.9945210912388298,
"adv/ratio_step_to_reasoning": 2.3041504644777713,
"adv/std_final_conf": 0.8654852509498596,
"adv/std_reasoning": 0.6185620427131653,
"adv/std_step_conf": 0.9333191514015198,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5999921235034656,
"calib/avg_num_step_conf": 7.75,
"calib/ece": 0.2398023715415022,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8300395256916996,
"calib/gap": 0.007934782608695734,
"calib/mean_conf": 0.9670750988142295,
"calib/mu_c": 0.9692391304347828,
"calib/mu_w": 0.9613043478260871,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2398023715415022,
"calib/std_conf": 0.03087879999322926,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.91320987654321,
"calib/step_q_c_n": 1458.0,
"calib/step_q_gap": 0.008457024832183246,
"calib/step_q_w": 0.9047528517110267,
"calib/step_q_w_n": 526.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2886.0,
"completions/max_terminated_length": 2886.0,
"completions/mean_length": 791.72265625,
"completions/mean_terminated_length": 791.72265625,
"completions/min_length": 325.0,
"completions/min_terminated_length": 325.0,
"epoch": 0.0384,
"grad_norm": 0.11893506348133087,
"kl": 0.048084259033203125,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0514,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019558222964406013,
"mask/share_reasoning": 0.8685697317123413,
"mask/share_step_conf": 0.11187203228473663,
"num_tokens": 11206446.0,
"reward": 0.7723823189735413,
"reward_std": 0.2554051876068115,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7375956773757935,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.4657626152038574,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.5802910327911377,
"adv/mean_abs_reasoning": 0.41940683126449585,
"adv/mean_abs_step_conf": 0.7726558446884155,
"adv/ratio_final_to_reasoning": 1.3835993825889341,
"adv/ratio_step_to_reasoning": 1.8422586068970006,
"adv/std_final_conf": 0.8165745735168457,
"adv/std_reasoning": 0.701346755027771,
"adv/std_step_conf": 0.9323224425315857,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5297814207650273,
"calib/avg_num_step_conf": 7.4453125,
"calib/ece": 0.4716942148760333,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8677685950413223,
"calib/gap": 0.0015300546448088204,
"calib/mean_conf": 0.9675619834710747,
"calib/mu_c": 0.9683333333333337,
"calib/mu_w": 0.9668032786885249,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4716942148760333,
"calib/std_conf": 0.030223791340730083,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.904900442477876,
"calib/step_q_c_n": 904.0,
"calib/step_q_gap": 0.011297648066698307,
"calib/step_q_w": 0.8936027944111777,
"calib/step_q_w_n": 1002.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 782.68359375,
"completions/mean_terminated_length": 814.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.04534284025430679,
"kl": 0.048694610595703125,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0298,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018106989562511444,
"mask/share_reasoning": 0.8466689586639404,
"mask/share_step_conf": 0.09616149961948395,
"num_tokens": 11513909.0,
"reward": 0.44464361667633057,
"reward_std": 0.29512786865234375,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.49853241443634033,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.10794225335121155,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.6019595861434937,
"adv/mean_abs_reasoning": 0.4334124028682709,
"adv/mean_abs_step_conf": 0.7732325792312622,
"adv/ratio_final_to_reasoning": 1.3888840793659754,
"adv/ratio_step_to_reasoning": 1.784057341493004,
"adv/std_final_conf": 0.8388135433197021,
"adv/std_reasoning": 0.7206992506980896,
"adv/std_step_conf": 0.9356791377067566,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5753998742753371,
"calib/avg_num_step_conf": 7.16796875,
"calib/ece": 0.39785123966942176,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8966942148760331,
"calib/gap": 0.0067542082838580875,
"calib/mean_conf": 0.9705785123966945,
"calib/mu_c": 0.9734532374100722,
"calib/mu_w": 0.9666990291262141,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.39702479338843005,
"calib/std_conf": 0.02669997123411818,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9112463768115943,
"calib/step_q_c_n": 1035.0,
"calib/step_q_gap": 0.009346376811594337,
"calib/step_q_w": 0.9018999999999999,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 797.734375,
"completions/mean_terminated_length": 816.8800659179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 461.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.037656933069229126,
"kl": 0.05291748046875,
"learning_rate": 4.5e-06,
"loss": 0.0564,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.01798248291015625,
"mask/share_reasoning": 0.8641847968101501,
"mask/share_step_conf": 0.09439519047737122,
"num_tokens": 11825017.0,
"reward": 0.5232806205749512,
"reward_std": 0.3373889923095703,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5682734251022339,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.1806315928697586,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.662196695804596,
"adv/mean_abs_reasoning": 0.4865483045578003,
"adv/mean_abs_step_conf": 0.7729369401931763,
"adv/ratio_final_to_reasoning": 1.361009152845438,
"adv/ratio_step_to_reasoning": 1.5886129556975035,
"adv/std_final_conf": 0.8549278378486633,
"adv/std_reasoning": 0.7576348185539246,
"adv/std_step_conf": 0.9355428218841553,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5317412736477485,
"calib/avg_num_step_conf": 7.30859375,
"calib/ece": 0.4031578947368425,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9068825910931174,
"calib/gap": 0.012642552624566816,
"calib/mean_conf": 0.9659109311740894,
"calib/mu_c": 0.9714388489208635,
"calib/mu_w": 0.9587962962962967,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.4031578947368425,
"calib/std_conf": 0.06733113652157344,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.9081381957773512,
"calib/step_q_c_n": 1042.0,
"calib/step_q_gap": 0.006678605910041102,
"calib/step_q_w": 0.9014595898673101,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2840.0,
"completions/max_terminated_length": 2840.0,
"completions/mean_length": 804.484375,
"completions/mean_terminated_length": 820.510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.0416,
"grad_norm": 0.29108595848083496,
"kl": 0.087554931640625,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0154,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01814432442188263,
"mask/share_reasoning": 0.8632323741912842,
"mask/share_step_conf": 0.09909210354089737,
"num_tokens": 12137053.0,
"reward": 0.5462459325790405,
"reward_std": 0.3595222234725952,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5637624859809875,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.22951051592826843,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.6811327338218689,
"adv/mean_abs_reasoning": 0.4848672151565552,
"adv/mean_abs_step_conf": 0.7687471508979797,
"adv/ratio_final_to_reasoning": 1.4047819950085572,
"adv/ratio_step_to_reasoning": 1.5854797496460236,
"adv/std_final_conf": 0.8914880156517029,
"adv/std_reasoning": 0.7576228380203247,
"adv/std_step_conf": 0.9358471632003784,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5200105666358473,
"calib/avg_num_step_conf": 7.671875,
"calib/ece": 0.4160323886639678,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8178137651821862,
"calib/gap": -0.0008922203143574281,
"calib/mean_conf": 0.9585425101214577,
"calib/mu_c": 0.9581343283582091,
"calib/mu_w": 0.9590265486725665,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4160323886639678,
"calib/std_conf": 0.03918580795538473,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9042110762800419,
"calib/step_q_c_n": 957.0,
"calib/step_q_gap": 0.013357054432971416,
"calib/step_q_w": 0.8908540218470705,
"calib/step_q_w_n": 1007.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2489.0,
"completions/max_terminated_length": 2489.0,
"completions/mean_length": 827.82421875,
"completions/mean_terminated_length": 844.3147583007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 451.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.028931770473718643,
"kl": 0.07387542724609375,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0035,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017673280090093613,
"mask/share_reasoning": 0.8669548630714417,
"mask/share_step_conf": 0.09584060311317444,
"num_tokens": 12455736.0,
"reward": 0.49367809295654297,
"reward_std": 0.3905242681503296,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5518601536750793,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.13940228521823883,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.6893098950386047,
"adv/mean_abs_reasoning": 0.46814632415771484,
"adv/mean_abs_step_conf": 0.7652798891067505,
"adv/ratio_final_to_reasoning": 1.4724240252848415,
"adv/ratio_step_to_reasoning": 1.6347023347532121,
"adv/std_final_conf": 0.8758644461631775,
"adv/std_reasoning": 0.739457368850708,
"adv/std_step_conf": 0.9358088374137878,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5544559809647416,
"calib/avg_num_step_conf": 8.25,
"calib/ece": 0.14963562753036444,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8704453441295547,
"calib/gap": 0.01726908933592952,
"calib/mean_conf": 0.9634008097165994,
"calib/mu_c": 0.9666169154228856,
"calib/mu_w": 0.9493478260869561,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14963562753036444,
"calib/std_conf": 0.03984500943245332,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8970809061488674,
"calib/step_q_c_n": 1545.0,
"calib/step_q_gap": -0.031472885738257905,
"calib/step_q_w": 0.9285537918871253,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2010.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 774.76953125,
"completions/mean_terminated_length": 783.95654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.03703810274600983,
"kl": 0.09344482421875,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0483,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019327331334352493,
"mask/share_reasoning": 0.8618630170822144,
"mask/share_step_conf": 0.10709092766046524,
"num_tokens": 12761325.0,
"reward": 0.8326570987701416,
"reward_std": 0.36482173204421997,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.7927339673042297,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.5241426825523376,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7318501472473145,
"adv/mean_abs_reasoning": 0.39691048860549927,
"adv/mean_abs_step_conf": 0.7763651609420776,
"adv/ratio_final_to_reasoning": 1.843866988293981,
"adv/ratio_step_to_reasoning": 1.9560207735244035,
"adv/std_final_conf": 0.9062535762786865,
"adv/std_reasoning": 0.7015131115913391,
"adv/std_step_conf": 0.9363811016082764,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.494484412470024,
"calib/avg_num_step_conf": 7.34375,
"calib/ece": 0.39168032786885265,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.8647540983606558,
"calib/gap": 0.009730044535799953,
"calib/mean_conf": 0.9613524590163937,
"calib/mu_c": 0.9655395683453238,
"calib/mu_w": 0.9558095238095239,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.39168032786885265,
"calib/std_conf": 0.04527030332133524,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.8892323030907279,
"calib/step_q_c_n": 1003.0,
"calib/step_q_gap": -0.012842953465714735,
"calib/step_q_w": 0.9020752565564426,
"calib/step_q_w_n": 877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 725.21875,
"completions/mean_terminated_length": 736.730224609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.0448,
"grad_norm": 0.026927631348371506,
"kl": 0.126861572265625,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0247,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.019635125994682312,
"mask/share_reasoning": 0.8590686321258545,
"mask/share_step_conf": 0.10567127168178558,
"num_tokens": 13051349.0,
"reward": 0.5205029845237732,
"reward_std": 0.3840179443359375,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.556390643119812,
"rewards/format_reward_step": 0.9296875,
"rewards/step_correlation_reward": 0.1900840848684311,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7539446353912354,
"adv/mean_abs_reasoning": 0.48671162128448486,
"adv/mean_abs_step_conf": 0.7926192879676819,
"adv/ratio_final_to_reasoning": 1.5490582152147785,
"adv/ratio_step_to_reasoning": 1.6285193393900754,
"adv/std_final_conf": 0.9174970984458923,
"adv/std_reasoning": 0.7395156025886536,
"adv/std_step_conf": 0.9361217021942139,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5945800706279747,
"calib/avg_num_step_conf": 7.27734375,
"calib/ece": 0.27718367346938777,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8326530612244898,
"calib/gap": 0.012746046368800723,
"calib/mean_conf": 0.9588163265306123,
"calib/mu_c": 0.962874251497006,
"calib/mu_w": 0.9501282051282053,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.27718367346938777,
"calib/std_conf": 0.03810601402499308,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8881299524564185,
"calib/step_q_c_n": 1262.0,
"calib/step_q_gap": -0.0035505799895049206,
"calib/step_q_w": 0.8916805324459234,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3036.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 798.6171875,
"completions/mean_terminated_length": 801.7490844726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.04725484177470207,
"kl": 0.1308135986328125,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0007,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01898164488375187,
"mask/share_reasoning": 0.8755961060523987,
"mask/share_step_conf": 0.10151597857475281,
"num_tokens": 13361019.0,
"reward": 0.6691012382507324,
"reward_std": 0.41626691818237305,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6680905818939209,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.34979933500289917,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7528752088546753,
"adv/mean_abs_reasoning": 0.5847516059875488,
"adv/mean_abs_step_conf": 0.7902089953422546,
"adv/ratio_final_to_reasoning": 1.2875128535700102,
"adv/ratio_step_to_reasoning": 1.3513584011585948,
"adv/std_final_conf": 0.9186169505119324,
"adv/std_reasoning": 0.8267921209335327,
"adv/std_step_conf": 0.9367049932479858,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.535816108339273,
"calib/avg_num_step_conf": 6.92578125,
"calib/ece": 0.47810126582278495,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.8649789029535865,
"calib/gap": 0.008359230220955194,
"calib/mean_conf": 0.9599578059071732,
"calib/mu_c": 0.9642608695652175,
"calib/mu_w": 0.9559016393442623,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.4764135021097048,
"calib/std_conf": 0.04752523549324154,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.8809450830140485,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.006056194125159564,
"calib/step_q_w": 0.8748888888888889,
"calib/step_q_w_n": 990.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2058.0,
"completions/max_terminated_length": 2058.0,
"completions/mean_length": 798.7109375,
"completions/mean_terminated_length": 824.4757690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03326358273625374,
"kl": 0.1516876220703125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.1195,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.01732334867119789,
"mask/share_reasoning": 0.8651239275932312,
"mask/share_step_conf": 0.0863027423620224,
"num_tokens": 13671809.0,
"reward": 0.40459272265434265,
"reward_std": 0.41235947608947754,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.4757663905620575,
"rewards/format_reward_step": 0.90625,
"rewards/step_correlation_reward": 0.06232527643442154,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7470870018005371,
"adv/mean_abs_reasoning": 0.5880221128463745,
"adv/mean_abs_step_conf": 0.8088736534118652,
"adv/ratio_final_to_reasoning": 1.2705083456542043,
"adv/ratio_step_to_reasoning": 1.3755837335715808,
"adv/std_final_conf": 0.905470609664917,
"adv/std_reasoning": 0.8432220220565796,
"adv/std_step_conf": 0.9366260170936584,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.559177646524307,
"calib/avg_num_step_conf": 8.69140625,
"calib/ece": 0.3577021276595747,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.87109375,
"calib/frac_conf_gt_0.9": 0.8851063829787233,
"calib/gap": 0.013560502801757024,
"calib/mean_conf": 0.9619574468085108,
"calib/mu_c": 0.9673239436619719,
"calib/mu_w": 0.9537634408602149,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.3577021276595747,
"calib/std_conf": 0.05073500509921511,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.8776923076923078,
"calib/step_q_c_n": 1040.0,
"calib/step_q_gap": -0.033767607919506504,
"calib/step_q_w": 0.9114599156118143,
"calib/step_q_w_n": 1185.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2564.0,
"completions/max_terminated_length": 2564.0,
"completions/mean_length": 749.8828125,
"completions/mean_terminated_length": 783.5509643554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.048,
"grad_norm": 0.028959548100829124,
"kl": 0.16290283203125,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0939,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.018562179058790207,
"mask/share_reasoning": 0.8374760150909424,
"mask/share_step_conf": 0.10099305957555771,
"num_tokens": 13968827.0,
"reward": 0.502269446849823,
"reward_std": 0.403462290763855,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5527081489562988,
"rewards/format_reward_step": 0.87109375,
"rewards/step_correlation_reward": 0.16667440533638,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.734359085559845,
"adv/mean_abs_reasoning": 0.606724739074707,
"adv/mean_abs_step_conf": 0.8087029457092285,
"adv/ratio_final_to_reasoning": 1.2103661483785684,
"adv/ratio_step_to_reasoning": 1.3328992434733267,
"adv/std_final_conf": 0.9238776564598083,
"adv/std_reasoning": 0.8433163166046143,
"adv/std_step_conf": 0.9367368817329407,
"calib/answer_extract_rate": 0.8671875,
"calib/auroc": 0.5771551367881643,
"calib/avg_num_step_conf": 7.45703125,
"calib/ece": 0.45181818181818206,
"calib/final_conf_rate": 0.859375,
"calib/format_rate": 0.81640625,
"calib/frac_conf_gt_0.9": 0.8181818181818182,
"calib/gap": 0.013158112240681086,
"calib/mean_conf": 0.9563636363636365,
"calib/mu_c": 0.9628828828828832,
"calib/mu_w": 0.9497247706422021,
"calib/nonempty_final_conf_rate": 0.859375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.45181818181818206,
"calib/std_conf": 0.04823667546073352,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.8656472081218276,
"calib/step_q_c_n": 788.0,
"calib/step_q_gap": -0.005030757979867295,
"calib/step_q_w": 0.8706779661016949,
"calib/step_q_w_n": 1121.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 844.49609375,
"completions/mean_terminated_length": 889.6748657226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.020218025892972946,
"kl": 0.149078369140625,
"learning_rate": 4.277777777777778e-06,
"loss": -0.101,
"mask/has_final_conf_rate": 0.859375,
"mask/share_final_conf": 0.01695256121456623,
"mask/share_reasoning": 0.8441320657730103,
"mask/share_step_conf": 0.08813411742448807,
"num_tokens": 14289786.0,
"reward": 0.39327341318130493,
"reward_std": 0.393865168094635,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.45081520080566406,
"rewards/format_reward_step": 0.81640625,
"rewards/step_correlation_reward": 0.08573156595230103,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7517024278640747,
"adv/mean_abs_reasoning": 0.629054844379425,
"adv/mean_abs_step_conf": 0.7849420309066772,
"adv/ratio_final_to_reasoning": 1.1949712089184272,
"adv/ratio_step_to_reasoning": 1.2478117574645466,
"adv/std_final_conf": 0.919205367565155,
"adv/std_reasoning": 0.8432762026786804,
"adv/std_step_conf": 0.9367557168006897,
"calib/answer_extract_rate": 0.890625,
"calib/auroc": 0.5730214621059692,
"calib/avg_num_step_conf": 6.74609375,
"calib/ece": 0.3327433628318586,
"calib/final_conf_rate": 0.8828125,
"calib/format_rate": 0.83984375,
"calib/frac_conf_gt_0.9": 0.911504424778761,
"calib/gap": 0.030679074446680166,
"calib/mean_conf": 0.9610619469026551,
"calib/mu_c": 0.9724647887323944,
"calib/mu_w": 0.9417857142857142,
"calib/nonempty_final_conf_rate": 0.8828125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.3327433628318586,
"calib/std_conf": 0.07747523534513975,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.8705286343612335,
"calib/step_q_c_n": 908.0,
"calib/step_q_gap": -0.0069072630746639385,
"calib/step_q_w": 0.8774358974358974,
"calib/step_q_w_n": 819.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2752.0,
"completions/max_terminated_length": 2752.0,
"completions/mean_length": 800.52734375,
"completions/mean_terminated_length": 823.0321044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.021257705986499786,
"kl": 0.1793212890625,
"learning_rate": 4.25e-06,
"loss": -0.0244,
"mask/has_final_conf_rate": 0.8828125,
"mask/share_final_conf": 0.017377939075231552,
"mask/share_reasoning": 0.8701250553131104,
"mask/share_step_conf": 0.0851532369852066,
"num_tokens": 14600697.0,
"reward": 0.5386103987693787,
"reward_std": 0.39626145362854004,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5514390468597412,
"rewards/format_reward_step": 0.83984375,
"rewards/step_correlation_reward": 0.24687549471855164,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7742482423782349,
"adv/mean_abs_reasoning": 0.6331319808959961,
"adv/mean_abs_step_conf": 0.7943623661994934,
"adv/ratio_final_to_reasoning": 1.222886010721704,
"adv/ratio_step_to_reasoning": 1.2546552538308477,
"adv/std_final_conf": 0.9287095665931702,
"adv/std_reasoning": 0.8592767715454102,
"adv/std_step_conf": 0.9367825388908386,
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.5647463768115943,
"calib/avg_num_step_conf": 6.9296875,
"calib/ece": 0.47548936170212774,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.84765625,
"calib/frac_conf_gt_0.9": 0.8893617021276595,
"calib/gap": 0.010768115942029088,
"calib/mean_conf": 0.952936170212766,
"calib/mu_c": 0.9584347826086957,
"calib/mu_w": 0.9476666666666667,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.4695319148936171,
"calib/std_conf": 0.10158476269039882,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.879713216957606,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": -0.00426826452387552,
"calib/step_q_w": 0.8839814814814815,
"calib/step_q_w_n": 972.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2720.0,
"completions/max_terminated_length": 2720.0,
"completions/mean_length": 754.78515625,
"completions/mean_terminated_length": 776.0039672851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.0512,
"grad_norm": 0.023023847490549088,
"kl": 0.19683837890625,
"learning_rate": 4.222222222222223e-06,
"loss": -0.1132,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.01880646124482155,
"mask/share_reasoning": 0.8589140176773071,
"mask/share_step_conf": 0.09493573009967804,
"num_tokens": 14897610.0,
"reward": 0.4060549736022949,
"reward_std": 0.4451637268066406,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.4412851333618164,
"rewards/format_reward_step": 0.84765625,
"rewards/step_correlation_reward": 0.11144982278347015,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7295511960983276,
"adv/mean_abs_reasoning": 0.5492097735404968,
"adv/mean_abs_step_conf": 0.7696366906166077,
"adv/ratio_final_to_reasoning": 1.3283652827138428,
"adv/ratio_step_to_reasoning": 1.4013528667837105,
"adv/std_final_conf": 0.9244924783706665,
"adv/std_reasoning": 0.8101789951324463,
"adv/std_step_conf": 0.9364129304885864,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.5738241792929293,
"calib/avg_num_step_conf": 7.296875,
"calib/ece": 0.33586206896551735,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.8663793103448276,
"calib/gap": 0.03034090909090914,
"calib/mean_conf": 0.9522413793103449,
"calib/mu_c": 0.9637500000000002,
"calib/mu_w": 0.9334090909090911,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.33370689655172425,
"calib/std_conf": 0.10317449403213547,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.8774825870646766,
"calib/step_q_c_n": 1005.0,
"calib/step_q_gap": 0.00204805635784,
"calib/step_q_w": 0.8754345307068366,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2975.0,
"completions/max_terminated_length": 2975.0,
"completions/mean_length": 752.1796875,
"completions/mean_terminated_length": 773.3252563476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.027003340423107147,
"kl": 0.1777801513671875,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0606,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.018772877752780914,
"mask/share_reasoning": 0.8578805923461914,
"mask/share_step_conf": 0.09600280225276947,
"num_tokens": 15194704.0,
"reward": 0.5183796882629395,
"reward_std": 0.36601462960243225,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5617023706436157,
"rewards/format_reward_step": 0.8515625,
"rewards/step_correlation_reward": 0.19146324694156647,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7670071721076965,
"adv/mean_abs_reasoning": 0.6311091184616089,
"adv/mean_abs_step_conf": 0.7808622717857361,
"adv/ratio_final_to_reasoning": 1.2153321029132849,
"adv/ratio_step_to_reasoning": 1.2372856752397514,
"adv/std_final_conf": 0.9278801679611206,
"adv/std_reasoning": 0.8593429327011108,
"adv/std_step_conf": 0.9367603063583374,
"calib/answer_extract_rate": 0.8984375,
"calib/auroc": 0.5901970284237726,
"calib/avg_num_step_conf": 6.640625,
"calib/ece": 0.35547826086956513,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.84375,
"calib/frac_conf_gt_0.9": 0.8826086956521739,
"calib/gap": -0.006333979328165262,
"calib/mean_conf": 0.9541739130434784,
"calib/mu_c": 0.9518055555555556,
"calib/mu_w": 0.9581395348837208,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.34178260869565213,
"calib/std_conf": 0.11421933597589791,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.8708485499462945,
"calib/step_q_c_n": 931.0,
"calib/step_q_gap": -0.004392022225357062,
"calib/step_q_w": 0.8752405721716515,
"calib/step_q_w_n": 769.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2471.0,
"completions/max_terminated_length": 2471.0,
"completions/mean_length": 777.17578125,
"completions/mean_terminated_length": 795.8280639648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 285.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.021602502092719078,
"kl": 0.1998291015625,
"learning_rate": 4.166666666666667e-06,
"loss": -0.1236,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.018588896840810776,
"mask/share_reasoning": 0.8642673492431641,
"mask/share_step_conf": 0.09370625764131546,
"num_tokens": 15499021.0,
"reward": 0.5040627717971802,
"reward_std": 0.42864006757736206,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5358277559280396,
"rewards/format_reward_step": 0.84375,
"rewards/step_correlation_reward": 0.19104784727096558,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7589243650436401,
"adv/mean_abs_reasoning": 0.6641005277633667,
"adv/mean_abs_step_conf": 0.807522177696228,
"adv/ratio_final_to_reasoning": 1.1427853665462846,
"adv/ratio_step_to_reasoning": 1.2159637644256858,
"adv/std_final_conf": 0.9353416562080383,
"adv/std_reasoning": 0.8905836939811707,
"adv/std_step_conf": 0.9368466138839722,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.540009250693802,
"calib/avg_num_step_conf": 6.625,
"calib/ece": 0.3523175965665237,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.859375,
"calib/frac_conf_gt_0.9": 0.8583690987124464,
"calib/gap": 0.01629201356768406,
"calib/mean_conf": 0.9574678111587984,
"calib/mu_c": 0.9639007092198584,
"calib/mu_w": 0.9476086956521743,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.3523175965665237,
"calib/std_conf": 0.06447563363765581,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.8706469298245614,
"calib/step_q_c_n": 912.0,
"calib/step_q_gap": -0.004671947726459069,
"calib/step_q_w": 0.8753188775510204,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2970.0,
"completions/max_terminated_length": 2970.0,
"completions/mean_length": 740.88671875,
"completions/mean_terminated_length": 755.6454467773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.0544,
"grad_norm": 0.023038936778903008,
"kl": 0.184295654296875,
"learning_rate": 4.138888888888889e-06,
"loss": -0.116,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.018784116953611374,
"mask/share_reasoning": 0.8652922511100769,
"mask/share_step_conf": 0.09639239311218262,
"num_tokens": 15797984.0,
"reward": 0.49212968349456787,
"reward_std": 0.4767027497291565,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5462964773178101,
"rewards/format_reward_step": 0.859375,
"rewards/step_correlation_reward": 0.15515035390853882,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7391918897628784,
"adv/mean_abs_reasoning": 0.6569032669067383,
"adv/mean_abs_step_conf": 0.7805554866790771,
"adv/ratio_final_to_reasoning": 1.1252674891443686,
"adv/ratio_step_to_reasoning": 1.1882350507322017,
"adv/std_final_conf": 0.9189611673355103,
"adv/std_reasoning": 0.8904977440834045,
"adv/std_step_conf": 0.9366629123687744,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.5567062818336164,
"calib/avg_num_step_conf": 6.96484375,
"calib/ece": 0.30367965367965377,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.859375,
"calib/frac_conf_gt_0.9": 0.8744588744588745,
"calib/gap": -0.001613752122241463,
"calib/mean_conf": 0.9556277056277057,
"calib/mu_c": 0.9550967741935481,
"calib/mu_w": 0.9567105263157896,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.2941558441558442,
"calib/std_conf": 0.09776471407915169,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.8801235839340886,
"calib/step_q_c_n": 971.0,
"calib/step_q_gap": 0.06337481546118218,
"calib/step_q_w": 0.8167487684729065,
"calib/step_q_w_n": 812.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 760.16796875,
"completions/mean_terminated_length": 775.310791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 323.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.021371837705373764,
"kl": 0.197784423828125,
"learning_rate": 4.111111111111111e-06,
"loss": -0.1177,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.01816929131746292,
"mask/share_reasoning": 0.8703060150146484,
"mask/share_step_conf": 0.09199343621730804,
"num_tokens": 16100539.0,
"reward": 0.6138618588447571,
"reward_std": 0.44294503331184387,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.5888835787773132,
"rewards/format_reward_step": 0.859375,
"rewards/step_correlation_reward": 0.3450901210308075,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7655702829360962,
"adv/mean_abs_reasoning": 0.6462835073471069,
"adv/mean_abs_step_conf": 0.7980622053146362,
"adv/ratio_final_to_reasoning": 1.1845734483905412,
"adv/ratio_step_to_reasoning": 1.2348484778616078,
"adv/std_final_conf": 0.9017614126205444,
"adv/std_reasoning": 0.843338668346405,
"adv/std_step_conf": 0.9365853071212769,
"calib/answer_extract_rate": 0.89453125,
"calib/auroc": 0.5494510282328336,
"calib/avg_num_step_conf": 6.3671875,
"calib/ece": 0.3045814977973569,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.87109375,
"calib/frac_conf_gt_0.9": 0.9074889867841409,
"calib/gap": 0.012929592192401529,
"calib/mean_conf": 0.9609691629955949,
"calib/mu_c": 0.9652980132450331,
"calib/mu_w": 0.9523684210526315,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.30017621145374457,
"calib/std_conf": 0.0841224926089066,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8756695464362853,
"calib/step_q_c_n": 926.0,
"calib/step_q_gap": -0.000807726290987576,
"calib/step_q_w": 0.8764772727272728,
"calib/step_q_w_n": 704.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3041.0,
"completions/max_terminated_length": 3041.0,
"completions/mean_length": 775.47265625,
"completions/mean_terminated_length": 790.9203491210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.018166445195674896,
"kl": 0.19354248046875,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0789,
"mask/has_final_conf_rate": 0.88671875,
"mask/share_final_conf": 0.017578979954123497,
"mask/share_reasoning": 0.8730362057685852,
"mask/share_step_conf": 0.08985357731580734,
"num_tokens": 16404884.0,
"reward": 0.5861221551895142,
"reward_std": 0.4507690966129303,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.5985612869262695,
"rewards/format_reward_step": 0.87109375,
"rewards/step_correlation_reward": 0.2807142436504364,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.7066130638122559,
"adv/mean_abs_reasoning": 0.6384649872779846,
"adv/mean_abs_step_conf": 0.7776536345481873,
"adv/ratio_final_to_reasoning": 1.1067373746285007,
"adv/ratio_step_to_reasoning": 1.218005137389939,
"adv/std_final_conf": 0.9105854630470276,
"adv/std_reasoning": 0.8750973343849182,
"adv/std_step_conf": 0.9368137121200562,
"calib/answer_extract_rate": 0.8984375,
"calib/auroc": 0.5397378694924707,
"calib/avg_num_step_conf": 7.51953125,
"calib/ece": 0.24951965065502194,
"calib/final_conf_rate": 0.89453125,
"calib/format_rate": 0.8671875,
"calib/frac_conf_gt_0.9": 0.8908296943231441,
"calib/gap": 0.04114612381483562,
"calib/mean_conf": 0.9551965065502184,
"calib/mu_c": 0.9670552147239265,
"calib/mu_w": 0.9259090909090909,
"calib/nonempty_final_conf_rate": 0.89453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.24646288209606995,
"calib/std_conf": 0.10126958610938055,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.8770415224913494,
"calib/step_q_c_n": 1156.0,
"calib/step_q_gap": -0.005130129004099371,
"calib/step_q_w": 0.8821716514954487,
"calib/step_q_w_n": 769.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3017.0,
"completions/max_terminated_length": 3017.0,
"completions/mean_length": 686.09765625,
"completions/mean_terminated_length": 708.2297973632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.0576,
"grad_norm": 0.0214396882802248,
"kl": 0.212432861328125,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0885,
"mask/has_final_conf_rate": 0.89453125,
"mask/share_final_conf": 0.019736099988222122,
"mask/share_reasoning": 0.8435460925102234,
"mask/share_step_conf": 0.1054677963256836,
"num_tokens": 16686757.0,
"reward": 0.6288886070251465,
"reward_std": 0.430873841047287,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.649349570274353,
"rewards/format_reward_step": 0.8671875,
"rewards/step_correlation_reward": 0.3076462745666504,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.7312518954277039,
"adv/mean_abs_reasoning": 0.5973133444786072,
"adv/mean_abs_step_conf": 0.7853642702102661,
"adv/ratio_final_to_reasoning": 1.2242349885318755,
"adv/ratio_step_to_reasoning": 1.314827933227924,
"adv/std_final_conf": 0.8862102627754211,
"adv/std_reasoning": 0.8268293142318726,
"adv/std_step_conf": 0.9367680549621582,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.6168327796234773,
"calib/avg_num_step_conf": 6.91796875,
"calib/ece": 0.412863247863248,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.8418803418803419,
"calib/gap": 0.02432115171650051,
"calib/mean_conf": 0.9555982905982908,
"calib/mu_c": 0.9665116279069768,
"calib/mu_w": 0.9421904761904762,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4085897435897437,
"calib/std_conf": 0.08376723548421357,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8804960541149945,
"calib/step_q_c_n": 887.0,
"calib/step_q_gap": -0.0022188780117021745,
"calib/step_q_w": 0.8827149321266967,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 737.4765625,
"completions/mean_terminated_length": 755.176025390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.024389022961258888,
"kl": 0.2047882080078125,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0954,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.01948772370815277,
"mask/share_reasoning": 0.854324221611023,
"mask/share_step_conf": 0.10275053232908249,
"num_tokens": 16983375.0,
"reward": 0.5285339951515198,
"reward_std": 0.42518290877342224,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5391316413879395,
"rewards/format_reward_step": 0.90625,
"rewards/step_correlation_reward": 0.2359049916267395,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7237775325775146,
"adv/mean_abs_reasoning": 0.5657122135162354,
"adv/mean_abs_step_conf": 0.7897391319274902,
"adv/ratio_final_to_reasoning": 1.2794094157501215,
"adv/ratio_step_to_reasoning": 1.3960086295800391,
"adv/std_final_conf": 0.8957918286323547,
"adv/std_reasoning": 0.7931345105171204,
"adv/std_step_conf": 0.9366181492805481,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.5529282687651331,
"calib/avg_num_step_conf": 6.6015625,
"calib/ece": 0.44352173913043497,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.8782608695652174,
"calib/gap": 0.02880750605326865,
"calib/mean_conf": 0.9565652173913044,
"calib/mu_c": 0.9705932203389831,
"calib/mu_w": 0.9417857142857144,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.44352173913043497,
"calib/std_conf": 0.08865631054169681,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8833725029377205,
"calib/step_q_c_n": 851.0,
"calib/step_q_gap": 0.008593003533667987,
"calib/step_q_w": 0.8747794994040525,
"calib/step_q_w_n": 839.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3007.0,
"completions/max_terminated_length": 3007.0,
"completions/mean_length": 801.71484375,
"completions/mean_terminated_length": 808.0275268554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.028454406186938286,
"kl": 0.1917572021484375,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0014,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.018351132050156593,
"mask/share_reasoning": 0.8761504292488098,
"mask/share_step_conf": 0.09768596291542053,
"num_tokens": 17295454.0,
"reward": 0.49251890182495117,
"reward_std": 0.3811336159706116,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.4930843710899353,
"rewards/format_reward_step": 0.87890625,
"rewards/step_correlation_reward": 0.22242222726345062,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.761801540851593,
"adv/mean_abs_reasoning": 0.6592216491699219,
"adv/mean_abs_step_conf": 0.7712447643280029,
"adv/ratio_final_to_reasoning": 1.155607589360631,
"adv/ratio_step_to_reasoning": 1.169932397243231,
"adv/std_final_conf": 0.9170172810554504,
"adv/std_reasoning": 0.8751125335693359,
"adv/std_step_conf": 0.9366140365600586,
"calib/answer_extract_rate": 0.90234375,
"calib/auroc": 0.6095045045045046,
"calib/avg_num_step_conf": 7.23828125,
"calib/ece": 0.2956053811659194,
"calib/final_conf_rate": 0.87109375,
"calib/format_rate": 0.85546875,
"calib/frac_conf_gt_0.9": 0.8385650224215246,
"calib/gap": 0.0452162162162163,
"calib/mean_conf": 0.9460089686098656,
"calib/mu_c": 0.9612162162162163,
"calib/mu_w": 0.916,
"calib/nonempty_final_conf_rate": 0.87109375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.28896860986547096,
"calib/std_conf": 0.11924177462795925,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8902629107981221,
"calib/step_q_c_n": 1065.0,
"calib/step_q_gap": 0.017839052930101795,
"calib/step_q_w": 0.8724238578680203,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2615.0,
"completions/max_terminated_length": 2615.0,
"completions/mean_length": 734.48046875,
"completions/mean_terminated_length": 758.17333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.0608,
"grad_norm": 0.019423753023147583,
"kl": 0.1970977783203125,
"learning_rate": 3.972222222222223e-06,
"loss": -0.098,
"mask/has_final_conf_rate": 0.87109375,
"mask/share_final_conf": 0.01847108080983162,
"mask/share_reasoning": 0.8484556674957275,
"mask/share_step_conf": 0.10182324051856995,
"num_tokens": 17590273.0,
"reward": 0.5865829586982727,
"reward_std": 0.43412959575653076,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6005644798278809,
"rewards/format_reward_step": 0.85546875,
"rewards/step_correlation_reward": 0.28275763988494873,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7713214159011841,
"adv/mean_abs_reasoning": 0.703680157661438,
"adv/mean_abs_step_conf": 0.8116668462753296,
"adv/ratio_final_to_reasoning": 1.096125004383441,
"adv/ratio_step_to_reasoning": 1.1534599028239862,
"adv/std_final_conf": 0.9167519807815552,
"adv/std_reasoning": 0.8905022740364075,
"adv/std_step_conf": 0.9369077086448669,
"calib/answer_extract_rate": 0.90234375,
"calib/auroc": 0.5551075268817205,
"calib/avg_num_step_conf": 7.24609375,
"calib/ece": 0.39802765647743815,
"calib/final_conf_rate": 0.89453125,
"calib/format_rate": 0.875,
"calib/frac_conf_gt_0.9": 0.759825327510917,
"calib/gap": 0.030838069636456744,
"calib/mean_conf": 0.9281586608442505,
"calib/mu_c": 0.9422983870967742,
"calib/mu_w": 0.9114603174603174,
"calib/nonempty_final_conf_rate": 0.89453125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.39235080058224164,
"calib/std_conf": 0.1266060092216613,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8813956310679611,
"calib/step_q_c_n": 824.0,
"calib/step_q_gap": 0.002439924634078161,
"calib/step_q_w": 0.878955706433883,
"calib/step_q_w_n": 1031.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3041.0,
"completions/max_terminated_length": 3041.0,
"completions/mean_length": 830.83203125,
"completions/mean_terminated_length": 857.633056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 386.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.02321190945804119,
"kl": 0.1953125,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0503,
"mask/has_final_conf_rate": 0.89453125,
"mask/share_final_conf": 0.017203208059072495,
"mask/share_reasoning": 0.8562976121902466,
"mask/share_step_conf": 0.09524921327829361,
"num_tokens": 17909286.0,
"reward": 0.4562646150588989,
"reward_std": 0.4806906580924988,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5300267934799194,
"rewards/format_reward_step": 0.875,
"rewards/step_correlation_reward": 0.10906495898962021,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.7684636116027832,
"adv/mean_abs_reasoning": 0.6147791743278503,
"adv/mean_abs_step_conf": 0.7819816470146179,
"adv/ratio_final_to_reasoning": 1.2499831544277,
"adv/ratio_step_to_reasoning": 1.2719715951171788,
"adv/std_final_conf": 0.9189398288726807,
"adv/std_reasoning": 0.8433147072792053,
"adv/std_step_conf": 0.9365225434303284,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.5273282442748091,
"calib/avg_num_step_conf": 6.75390625,
"calib/ece": 0.37584415584415587,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.7662337662337663,
"calib/gap": 0.020382442748091534,
"calib/mean_conf": 0.9144588744588745,
"calib/mu_c": 0.9232824427480916,
"calib/mu_w": 0.9029,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.36160173160173165,
"calib/std_conf": 0.1602447716364824,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8843528561099061,
"calib/step_q_c_n": 922.0,
"calib/step_q_gap": 0.00017689576294210863,
"calib/step_q_w": 0.884175960346964,
"calib/step_q_w_n": 807.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2828.0,
"completions/max_terminated_length": 2828.0,
"completions/mean_length": 772.96484375,
"completions/mean_terminated_length": 788.362548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.021945487707853317,
"kl": 0.2040863037109375,
"learning_rate": 3.916666666666667e-06,
"loss": -0.105,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.01884847693145275,
"mask/share_reasoning": 0.8630433082580566,
"mask/share_step_conf": 0.09857693314552307,
"num_tokens": 18213413.0,
"reward": 0.5048579573631287,
"reward_std": 0.44036370515823364,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5499148368835449,
"rewards/format_reward_step": 0.890625,
"rewards/step_correlation_reward": 0.17698858678340912,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.7791723608970642,
"adv/mean_abs_reasoning": 0.6329578161239624,
"adv/mean_abs_step_conf": 0.780436098575592,
"adv/ratio_final_to_reasoning": 1.2310020368631742,
"adv/ratio_step_to_reasoning": 1.2329985959486858,
"adv/std_final_conf": 0.9260181784629822,
"adv/std_reasoning": 0.8593480587005615,
"adv/std_step_conf": 0.9366816878318787,
"calib/answer_extract_rate": 0.89453125,
"calib/auroc": 0.5887887413029728,
"calib/avg_num_step_conf": 7.30078125,
"calib/ece": 0.373849557522124,
"calib/final_conf_rate": 0.8828125,
"calib/format_rate": 0.859375,
"calib/frac_conf_gt_0.9": 0.6858407079646017,
"calib/gap": 0.0424019607843138,
"calib/mean_conf": 0.9083628318584072,
"calib/mu_c": 0.9275000000000001,
"calib/mu_w": 0.8850980392156863,
"calib/nonempty_final_conf_rate": 0.8828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3667699115044249,
"calib/std_conf": 0.15408729636707594,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8931798245614035,
"calib/step_q_c_n": 912.0,
"calib/step_q_gap": 0.002093095198812067,
"calib/step_q_w": 0.8910867293625915,
"calib/step_q_w_n": 957.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 726.265625,
"completions/mean_terminated_length": 765.1193237304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 342.0,
"epoch": 0.064,
"grad_norm": 0.019068323075771332,
"kl": 0.208465576171875,
"learning_rate": 3.88888888888889e-06,
"loss": -0.1865,
"mask/has_final_conf_rate": 0.8828125,
"mask/share_final_conf": 0.018185026943683624,
"mask/share_reasoning": 0.8298521041870117,
"mask/share_step_conf": 0.10118165612220764,
"num_tokens": 18508193.0,
"reward": 0.5068765878677368,
"reward_std": 0.44211721420288086,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5381327867507935,
"rewards/format_reward_step": 0.859375,
"rewards/step_correlation_reward": 0.20530793070793152,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.7496904730796814,
"adv/mean_abs_reasoning": 0.501232922077179,
"adv/mean_abs_step_conf": 0.7808792591094971,
"adv/ratio_final_to_reasoning": 1.4956928008097707,
"adv/ratio_step_to_reasoning": 1.5579169378448343,
"adv/std_final_conf": 0.919006884098053,
"adv/std_reasoning": 0.757882297039032,
"adv/std_step_conf": 0.9365785121917725,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5127613661730783,
"calib/avg_num_step_conf": 8.12109375,
"calib/ece": 0.2959583333333334,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.6625,
"calib/gap": 0.003881985266760757,
"calib/mean_conf": 0.9063750000000002,
"calib/mu_c": 0.9078145695364237,
"calib/mu_w": 0.9039325842696629,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.28658333333333336,
"calib/std_conf": 0.1391513901295995,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8957231588287489,
"calib/step_q_c_n": 1127.0,
"calib/step_q_gap": -0.006850370583015852,
"calib/step_q_w": 0.9025735294117647,
"calib/step_q_w_n": 952.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2137.0,
"completions/max_terminated_length": 2137.0,
"completions/mean_length": 700.57421875,
"completions/mean_terminated_length": 711.6944580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.020417187362909317,
"kl": 0.203521728515625,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0435,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.020129330456256866,
"mask/share_reasoning": 0.8452743887901306,
"mask/share_step_conf": 0.11897126585245132,
"num_tokens": 18791604.0,
"reward": 0.581881582736969,
"reward_std": 0.37869957089424133,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6180222630500793,
"rewards/format_reward_step": 0.91796875,
"rewards/step_correlation_reward": 0.24261584877967834,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.8006377816200256,
"adv/mean_abs_reasoning": 0.7234856486320496,
"adv/mean_abs_step_conf": 0.8082146644592285,
"adv/ratio_final_to_reasoning": 1.106639479489128,
"adv/ratio_step_to_reasoning": 1.117112227433098,
"adv/std_final_conf": 0.9264470338821411,
"adv/std_reasoning": 0.8750442862510681,
"adv/std_step_conf": 0.9368706345558167,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5740320473144901,
"calib/avg_num_step_conf": 8.84375,
"calib/ece": 0.3516322314049586,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5537190082644629,
"calib/gap": 0.03770751667698258,
"calib/mean_conf": 0.8761776859504133,
"calib/mu_c": 0.8934732824427483,
"calib/mu_w": 0.8557657657657657,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3432438016528925,
"calib/std_conf": 0.16861234035581443,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.893684749232344,
"calib/step_q_c_n": 977.0,
"calib/step_q_gap": 0.00502119056878525,
"calib/step_q_w": 0.8886635586635587,
"calib/step_q_w_n": 1287.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 789.6796875,
"completions/mean_terminated_length": 799.0435180664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.02827797457575798,
"kl": 0.198089599609375,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0186,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018833544105291367,
"mask/share_reasoning": 0.8600931763648987,
"mask/share_step_conf": 0.10935452580451965,
"num_tokens": 19100842.0,
"reward": 0.50331711769104,
"reward_std": 0.4939599633216858,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5874221324920654,
"rewards/format_reward_step": 0.9296875,
"rewards/step_correlation_reward": 0.13093087077140808,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7669820785522461,
"adv/mean_abs_reasoning": 0.5731774568557739,
"adv/mean_abs_step_conf": 0.7635728120803833,
"adv/ratio_final_to_reasoning": 1.3381232450410874,
"adv/ratio_step_to_reasoning": 1.332175232900895,
"adv/std_final_conf": 0.9358986616134644,
"adv/std_reasoning": 0.8267452120780945,
"adv/std_step_conf": 0.9365659356117249,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6514564450287138,
"calib/avg_num_step_conf": 6.92578125,
"calib/ece": 0.28414634146341466,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.45528455284552843,
"calib/gap": 0.07411056528056459,
"calib/mean_conf": 0.8356097560975609,
"calib/mu_c": 0.8648322147651007,
"calib/mu_w": 0.7907216494845362,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2570325203252033,
"calib/std_conf": 0.20707661808730868,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8932848837209304,
"calib/step_q_c_n": 1032.0,
"calib/step_q_gap": 0.0033388648275429755,
"calib/step_q_w": 0.8899460188933874,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2221.0,
"completions/max_terminated_length": 2221.0,
"completions/mean_length": 775.03125,
"completions/mean_terminated_length": 790.4701538085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.0672,
"grad_norm": 0.022803494706749916,
"kl": 0.1893310546875,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0539,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019247833639383316,
"mask/share_reasoning": 0.8624863028526306,
"mask/share_step_conf": 0.09873461723327637,
"num_tokens": 19407890.0,
"reward": 0.6503553986549377,
"reward_std": 0.4144784212112427,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6726309061050415,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.3210486173629761,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7718286514282227,
"adv/mean_abs_reasoning": 0.6045994162559509,
"adv/mean_abs_step_conf": 0.7827744483947754,
"adv/ratio_final_to_reasoning": 1.2765950986321775,
"adv/ratio_step_to_reasoning": 1.2946993122193091,
"adv/std_final_conf": 0.9342676997184753,
"adv/std_reasoning": 0.8102007508277893,
"adv/std_step_conf": 0.9365555644035339,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5747526283240568,
"calib/avg_num_step_conf": 7.97265625,
"calib/ece": 0.2109795918367347,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5061224489795918,
"calib/gap": 0.055535714285714355,
"calib/mean_conf": 0.8552244897959184,
"calib/mu_c": 0.8726785714285715,
"calib/mu_w": 0.8171428571428572,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1902448979591837,
"calib/std_conf": 0.18717788824866435,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8956170328180094,
"calib/step_q_c_n": 1229.0,
"calib/step_q_gap": -0.016465479497261515,
"calib/step_q_w": 0.9120825123152709,
"calib/step_q_w_n": 812.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 722.1171875,
"completions/mean_terminated_length": 742.4176635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.01971907913684845,
"kl": 0.207366943359375,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0808,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02002081833779812,
"mask/share_reasoning": 0.8429508209228516,
"mask/share_step_conf": 0.10968463122844696,
"num_tokens": 19696528.0,
"reward": 0.7009572982788086,
"reward_std": 0.45025748014450073,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.708678126335144,
"rewards/format_reward_step": 0.94921875,
"rewards/step_correlation_reward": 0.3721427321434021,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.705558180809021,
"adv/mean_abs_reasoning": 0.4466238021850586,
"adv/mean_abs_step_conf": 0.7446850538253784,
"adv/ratio_final_to_reasoning": 1.579759469506896,
"adv/ratio_step_to_reasoning": 1.6673653535303927,
"adv/std_final_conf": 0.9230350255966187,
"adv/std_reasoning": 0.7576389312744141,
"adv/std_step_conf": 0.9363834857940674,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5003857212988287,
"calib/avg_num_step_conf": 8.44921875,
"calib/ece": 0.31627049180327876,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5614754098360656,
"calib/gap": -0.0045255628024407635,
"calib/mean_conf": 0.8707786885245902,
"calib/mu_c": 0.8689795918367346,
"calib/mu_w": 0.8735051546391753,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2922950819672132,
"calib/std_conf": 0.176158368218379,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8958239700374532,
"calib/step_q_c_n": 1068.0,
"calib/step_q_gap": -0.015418039094966973,
"calib/step_q_w": 0.9112420091324201,
"calib/step_q_w_n": 1095.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 678.75,
"completions/mean_terminated_length": 697.831298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.02425149828195572,
"kl": 0.190521240234375,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0316,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.021705515682697296,
"mask/share_reasoning": 0.8342607617378235,
"mask/share_step_conf": 0.11668997257947922,
"num_tokens": 19975312.0,
"reward": 0.6031345725059509,
"reward_std": 0.3473690152168274,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6245855093002319,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.2762148082256317,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.7633180618286133,
"adv/mean_abs_reasoning": 0.5400905609130859,
"adv/mean_abs_step_conf": 0.7656220197677612,
"adv/ratio_final_to_reasoning": 1.4133149458085978,
"adv/ratio_step_to_reasoning": 1.4175808191748216,
"adv/std_final_conf": 0.9359403848648071,
"adv/std_reasoning": 0.7928449511528015,
"adv/std_step_conf": 0.9361146688461304,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.6635693006100156,
"calib/avg_num_step_conf": 7.98046875,
"calib/ece": 0.3714086471408645,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.37656903765690375,
"calib/gap": 0.08457393483709286,
"calib/mean_conf": 0.8112691771269176,
"calib/mu_c": 0.8583333333333334,
"calib/mu_w": 0.7737593984962405,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3695815899581588,
"calib/std_conf": 0.19077935433179818,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8942408376963351,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": -0.010505057534313855,
"calib/step_q_w": 0.904745895230649,
"calib/step_q_w_n": 1279.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2857.0,
"completions/max_terminated_length": 2857.0,
"completions/mean_length": 803.25,
"completions/mean_terminated_length": 832.5182495117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.0704,
"grad_norm": 0.028425876051187515,
"kl": 0.188751220703125,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0473,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.018159134313464165,
"mask/share_reasoning": 0.8508436679840088,
"mask/share_step_conf": 0.0958409234881401,
"num_tokens": 20287296.0,
"reward": 0.48330575227737427,
"reward_std": 0.3188536763191223,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/final_brier_reward_step": 0.5817521810531616,
"rewards/format_reward_step": 0.9296875,
"rewards/step_correlation_reward": 0.11610931158065796,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7104264497756958,
"adv/mean_abs_reasoning": 0.35727912187576294,
"adv/mean_abs_step_conf": 0.748501181602478,
"adv/ratio_final_to_reasoning": 1.9884353892437443,
"adv/ratio_step_to_reasoning": 2.0950039780459244,
"adv/std_final_conf": 0.9293177127838135,
"adv/std_reasoning": 0.6814470887184143,
"adv/std_step_conf": 0.9362070560455322,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.693968949044586,
"calib/avg_num_step_conf": 7.41015625,
"calib/ece": 0.1912648221343874,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3241106719367589,
"calib/gap": 0.1331303078556264,
"calib/mean_conf": 0.8086561264822133,
"calib/mu_c": 0.8591719745222931,
"calib/mu_w": 0.7260416666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1896837944664032,
"calib/std_conf": 0.1835394550318141,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8963327526132404,
"calib/step_q_c_n": 1148.0,
"calib/step_q_gap": -0.004174590510925169,
"calib/step_q_w": 0.9005073431241656,
"calib/step_q_w_n": 749.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 772.74609375,
"completions/mean_terminated_length": 778.8306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.039756689220666885,
"kl": 0.189849853515625,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0167,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01978326216340065,
"mask/share_reasoning": 0.8672667145729065,
"mask/share_step_conf": 0.10513752698898315,
"num_tokens": 20590127.0,
"reward": 0.6821687817573547,
"reward_std": 0.2857590317726135,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7492746114730835,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.2947504222393036,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.7532162666320801,
"adv/mean_abs_reasoning": 0.5838368535041809,
"adv/mean_abs_step_conf": 0.7709550857543945,
"adv/ratio_final_to_reasoning": 1.290114288111972,
"adv/ratio_step_to_reasoning": 1.320497466247861,
"adv/std_final_conf": 0.9340994954109192,
"adv/std_reasoning": 0.826541006565094,
"adv/std_step_conf": 0.9367088079452515,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6727183833116037,
"calib/avg_num_step_conf": 7.7109375,
"calib/ece": 0.31544354838709676,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3870967741935484,
"calib/gap": 0.09035071707953068,
"calib/mean_conf": 0.8143951612903227,
"calib/mu_c": 0.8573846153846152,
"calib/mu_w": 0.7670338983050845,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30282258064516127,
"calib/std_conf": 0.20217077253379004,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8955928853754942,
"calib/step_q_c_n": 1012.0,
"calib/step_q_gap": 0.0017571265397352542,
"calib/step_q_w": 0.8938357588357589,
"calib/step_q_w_n": 962.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2498.0,
"completions/max_terminated_length": 2498.0,
"completions/mean_length": 742.51953125,
"completions/mean_terminated_length": 751.3241577148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.9252099990844727,
"kl": 0.35418701171875,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0128,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02056063711643219,
"mask/share_reasoning": 0.853926956653595,
"mask/share_step_conf": 0.11379365622997284,
"num_tokens": 20884300.0,
"reward": 0.5387983322143555,
"reward_std": 0.3924047648906708,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6455503702163696,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.13829626142978668,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7773676514625549,
"adv/mean_abs_reasoning": 0.5483871102333069,
"adv/mean_abs_step_conf": 0.7789148092269897,
"adv/ratio_final_to_reasoning": 1.4175527414053006,
"adv/ratio_step_to_reasoning": 1.4203740290241444,
"adv/std_final_conf": 0.9361997246742249,
"adv/std_reasoning": 0.7928118109703064,
"adv/std_step_conf": 0.9365246295928955,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6481659304782024,
"calib/avg_num_step_conf": 7.12890625,
"calib/ece": 0.26452,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.22,
"calib/gap": 0.10831380833493387,
"calib/mean_conf": 0.7264400000000001,
"calib/mu_c": 0.7814634146341464,
"calib/mu_w": 0.6731496062992125,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24947999999999998,
"calib/std_conf": 0.22698838384375533,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8946553672316385,
"calib/step_q_c_n": 885.0,
"calib/step_q_gap": 0.004868133189085189,
"calib/step_q_w": 0.8897872340425533,
"calib/step_q_w_n": 940.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2023.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 792.01953125,
"completions/mean_terminated_length": 801.4110717773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.0736,
"grad_norm": 0.03803575411438942,
"kl": 0.18316650390625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0205,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018899952992796898,
"mask/share_reasoning": 0.8710591793060303,
"mask/share_step_conf": 0.09832212328910828,
"num_tokens": 21191553.0,
"reward": 0.5194801688194275,
"reward_std": 0.36671119928359985,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.681368350982666,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.06618569791316986,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7430238723754883,
"adv/mean_abs_reasoning": 0.5215783715248108,
"adv/mean_abs_step_conf": 0.7823535203933716,
"adv/ratio_final_to_reasoning": 1.4245680283929174,
"adv/ratio_step_to_reasoning": 1.4999730876612012,
"adv/std_final_conf": 0.919750452041626,
"adv/std_reasoning": 0.7754024267196655,
"adv/std_step_conf": 0.9365573525428772,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7753010845698316,
"calib/avg_num_step_conf": 7.51953125,
"calib/ece": 0.21260162601626023,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3170731707317073,
"calib/gap": 0.20759198882161145,
"calib/mean_conf": 0.7467479674796748,
"calib/mu_c": 0.8421052631578947,
"calib/mu_w": 0.6345132743362832,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20934959349593502,
"calib/std_conf": 0.23748682782457178,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8947928994082841,
"calib/step_q_c_n": 1014.0,
"calib/step_q_gap": -0.009140507104705264,
"calib/step_q_w": 0.9039334065129894,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 795.75390625,
"completions/mean_terminated_length": 808.3849487304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.042088743299245834,
"kl": 0.1686248779296875,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0327,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01939772255718708,
"mask/share_reasoning": 0.8597534894943237,
"mask/share_step_conf": 0.10522376000881195,
"num_tokens": 21502258.0,
"reward": 0.6089703440666199,
"reward_std": 0.3306017518043518,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7263593673706055,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.19548749923706055,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7810891270637512,
"adv/mean_abs_reasoning": 0.5818857550621033,
"adv/mean_abs_step_conf": 0.7901183366775513,
"adv/ratio_final_to_reasoning": 1.3423410356219967,
"adv/ratio_step_to_reasoning": 1.3578581874602238,
"adv/std_final_conf": 0.9348444938659668,
"adv/std_reasoning": 0.8429794311523438,
"adv/std_step_conf": 0.9366829991340637,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6373737373737374,
"calib/avg_num_step_conf": 7.73046875,
"calib/ece": 0.23112925170068033,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3306122448979592,
"calib/gap": 0.11414590347923692,
"calib/mean_conf": 0.6934421768707483,
"calib/mu_c": 0.7446913580246913,
"calib/mu_w": 0.6305454545454544,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1867755102040817,
"calib/std_conf": 0.28514449046638407,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8991023166023168,
"calib/step_q_c_n": 1036.0,
"calib/step_q_gap": 0.014489379168594496,
"calib/step_q_w": 0.8846129374337223,
"calib/step_q_w_n": 943.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 772.3671875,
"completions/mean_terminated_length": 794.0802612304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.028518889099359512,
"kl": 0.1798095703125,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0375,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019431080669164658,
"mask/share_reasoning": 0.8467603325843811,
"mask/share_step_conf": 0.10646484792232513,
"num_tokens": 21804392.0,
"reward": 0.607774019241333,
"reward_std": 0.3878346085548401,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6770904660224915,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.24158260226249695,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7352126836776733,
"adv/mean_abs_reasoning": 0.49225544929504395,
"adv/mean_abs_step_conf": 0.7592939138412476,
"adv/ratio_final_to_reasoning": 1.4935592581668053,
"adv/ratio_step_to_reasoning": 1.5424794482796031,
"adv/std_final_conf": 0.9240669012069702,
"adv/std_reasoning": 0.7753722667694092,
"adv/std_step_conf": 0.9365233778953552,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7154524241638581,
"calib/avg_num_step_conf": 7.4140625,
"calib/ece": 0.26682730923694775,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.3534136546184739,
"calib/gap": 0.19477119522945296,
"calib/mean_conf": 0.723293172690763,
"calib/mu_c": 0.8273275862068965,
"calib/mu_w": 0.6325563909774435,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.26212851405622484,
"calib/std_conf": 0.26641227523362154,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9019730941704036,
"calib/step_q_c_n": 892.0,
"calib/step_q_gap": 0.004030748245950289,
"calib/step_q_w": 0.8979423459244533,
"calib/step_q_w_n": 1006.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 753.2734375,
"completions/mean_terminated_length": 762.20556640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.0768,
"grad_norm": 0.03393369913101196,
"kl": 0.193572998046875,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0079,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019757676869630814,
"mask/share_reasoning": 0.8599628210067749,
"mask/share_step_conf": 0.1085607260465622,
"num_tokens": 22101638.0,
"reward": 0.5681977868080139,
"reward_std": 0.36324363946914673,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6912593841552734,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.16076117753982544,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7578251361846924,
"adv/mean_abs_reasoning": 0.6473281383514404,
"adv/mean_abs_step_conf": 0.7835944890975952,
"adv/ratio_final_to_reasoning": 1.1706970410936504,
"adv/ratio_step_to_reasoning": 1.2105058357160037,
"adv/std_final_conf": 0.9243986010551453,
"adv/std_reasoning": 0.8267653584480286,
"adv/std_step_conf": 0.9366769194602966,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6147632129774986,
"calib/avg_num_step_conf": 7.69921875,
"calib/ece": 0.19759842519685045,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.38188976377952755,
"calib/gap": 0.10122710622710618,
"calib/mean_conf": 0.7414566929133858,
"calib/mu_c": 0.7805128205128204,
"calib/mu_w": 0.6792857142857143,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16244094488188981,
"calib/std_conf": 0.2645569839010028,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8973987941429803,
"calib/step_q_c_n": 1161.0,
"calib/step_q_gap": -0.006317255239735919,
"calib/step_q_w": 0.9037160493827162,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 726.265625,
"completions/mean_terminated_length": 726.265625,
"completions/min_length": 285.0,
"completions/min_terminated_length": 285.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.024225082248449326,
"kl": 0.1775665283203125,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.1016,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020709872245788574,
"mask/share_reasoning": 0.8673610091209412,
"mask/share_step_conf": 0.11192911863327026,
"num_tokens": 22394594.0,
"reward": 0.6562621593475342,
"reward_std": 0.42811983823776245,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7120077610015869,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.2825477719306946,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7497447729110718,
"adv/mean_abs_reasoning": 0.6239073872566223,
"adv/mean_abs_step_conf": 0.7768936157226562,
"adv/ratio_final_to_reasoning": 1.2016924117660601,
"adv/ratio_step_to_reasoning": 1.2452066309692666,
"adv/std_final_conf": 0.917289137840271,
"adv/std_reasoning": 0.8591247797012329,
"adv/std_step_conf": 0.9366591572761536,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6838306395911037,
"calib/avg_num_step_conf": 8.015625,
"calib/ece": 0.23120331950207476,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.3900414937759336,
"calib/gap": 0.1913862411935351,
"calib/mean_conf": 0.6926970954356846,
"calib/mu_c": 0.7832283464566929,
"calib/mu_w": 0.5918421052631578,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1984647302904565,
"calib/std_conf": 0.3053855931948699,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9011902339776197,
"calib/step_q_c_n": 983.0,
"calib/step_q_gap": -0.0033934891280865687,
"calib/step_q_w": 0.9045837231057062,
"calib/step_q_w_n": 1069.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 732.9140625,
"completions/mean_terminated_length": 753.51806640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 309.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.040199581533670425,
"kl": 0.211181640625,
"learning_rate": 3.5e-06,
"loss": -0.0316,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.02010929211974144,
"mask/share_reasoning": 0.8409967422485352,
"mask/share_step_conf": 0.11155019700527191,
"num_tokens": 22686148.0,
"reward": 0.5797815322875977,
"reward_std": 0.39971089363098145,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.68291175365448,
"rewards/format_reward_step": 0.94140625,
"rewards/step_correlation_reward": 0.18915137648582458,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7649242877960205,
"adv/mean_abs_reasoning": 0.5034365653991699,
"adv/mean_abs_step_conf": 0.7570182681083679,
"adv/ratio_final_to_reasoning": 1.5194055028353364,
"adv/ratio_step_to_reasoning": 1.5037013998141664,
"adv/std_final_conf": 0.9224418997764587,
"adv/std_reasoning": 0.7577688694000244,
"adv/std_step_conf": 0.9364383220672607,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.741899908731366,
"calib/avg_num_step_conf": 7.64453125,
"calib/ece": 0.14323962516733602,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.42971887550200805,
"calib/gap": 0.28359851941993697,
"calib/mean_conf": 0.6899330655957163,
"calib/mu_c": 0.7764932562620422,
"calib/mu_w": 0.4928947368421053,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06919678714859441,
"calib/std_conf": 0.32527041573984733,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.89967542503864,
"calib/step_q_c_n": 1294.0,
"calib/step_q_gap": 0.0060354049280315625,
"calib/step_q_w": 0.8936400201106084,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2011.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 680.83203125,
"completions/mean_terminated_length": 697.1720581054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.08,
"grad_norm": 0.06415676325559616,
"kl": 0.205810546875,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0964,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.021222256124019623,
"mask/share_reasoning": 0.8401088714599609,
"mask/share_step_conf": 0.11523135751485825,
"num_tokens": 22965193.0,
"reward": 0.7654494047164917,
"reward_std": 0.35930517315864563,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7767044901847839,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.4252881109714508,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.7344155311584473,
"adv/mean_abs_reasoning": 0.45388472080230713,
"adv/mean_abs_step_conf": 0.7792856097221375,
"adv/ratio_final_to_reasoning": 1.6180662126283105,
"adv/ratio_step_to_reasoning": 1.7169240866813869,
"adv/std_final_conf": 0.9143033027648926,
"adv/std_reasoning": 0.7392937541007996,
"adv/std_step_conf": 0.9359453916549683,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7301700680272109,
"calib/avg_num_step_conf": 7.4296875,
"calib/ece": 0.14139784946236564,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3870967741935484,
"calib/gap": 0.2686938775510206,
"calib/mean_conf": 0.639489247311828,
"calib/mu_c": 0.7456666666666668,
"calib/mu_w": 0.4769727891156462,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08802419354838713,
"calib/std_conf": 0.332289102101497,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9007261592300965,
"calib/step_q_c_n": 1143.0,
"calib/step_q_gap": 0.0097907178598724,
"calib/step_q_w": 0.8909354413702241,
"calib/step_q_w_n": 759.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2774.0,
"completions/max_terminated_length": 2774.0,
"completions/mean_length": 757.26953125,
"completions/mean_terminated_length": 769.2897338867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 349.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.09344251453876495,
"kl": 0.189422607421875,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0008,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.020103726536035538,
"mask/share_reasoning": 0.8557681441307068,
"mask/share_step_conf": 0.10850309580564499,
"num_tokens": 23262110.0,
"reward": 0.6588497757911682,
"reward_std": 0.3028367757797241,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7497581839561462,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.25700393319129944,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7352517247200012,
"adv/mean_abs_reasoning": 0.5148537755012512,
"adv/mean_abs_step_conf": 0.7778766751289368,
"adv/ratio_final_to_reasoning": 1.4280787277983442,
"adv/ratio_step_to_reasoning": 1.5108691285629823,
"adv/std_final_conf": 0.8905107975006104,
"adv/std_reasoning": 0.7576778531074524,
"adv/std_step_conf": 0.9358101487159729,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6792625045637094,
"calib/avg_num_step_conf": 8.6640625,
"calib/ece": 0.22512768817204304,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5201612903225806,
"calib/gap": 0.1941892418157477,
"calib/mean_conf": 0.7020900537634409,
"calib/mu_c": 0.767080808080808,
"calib/mu_w": 0.5728915662650603,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13094758064516132,
"calib/std_conf": 0.34288262180861306,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.902955700798838,
"calib/step_q_c_n": 1377.0,
"calib/step_q_gap": 0.0015288280283265454,
"calib/step_q_w": 0.9014268727705115,
"calib/step_q_w_n": 841.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 763.625,
"completions/mean_terminated_length": 778.836669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.0820995420217514,
"kl": 0.18267822265625,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0688,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.020010104402899742,
"mask/share_reasoning": 0.841918408870697,
"mask/share_step_conf": 0.11854026466608047,
"num_tokens": 23562262.0,
"reward": 0.6911901831626892,
"reward_std": 0.33868926763534546,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7139571905136108,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.34732943773269653,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7347139716148376,
"adv/mean_abs_reasoning": 0.5040462017059326,
"adv/mean_abs_step_conf": 0.7889659404754639,
"adv/ratio_final_to_reasoning": 1.4576321954777465,
"adv/ratio_step_to_reasoning": 1.5652651241200253,
"adv/std_final_conf": 0.9141965508460999,
"adv/std_reasoning": 0.739403486251831,
"adv/std_step_conf": 0.9360442757606506,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6120240752011903,
"calib/avg_num_step_conf": 8.15625,
"calib/ece": 0.2332142857142857,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.17125786163522017,
"calib/mean_conf": 0.7713888888888889,
"calib/mu_c": 0.8345911949685534,
"calib/mu_w": 0.6633333333333332,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1868253968253968,
"calib/std_conf": 0.3135410525315225,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9005529225908372,
"calib/step_q_c_n": 1266.0,
"calib/step_q_gap": -0.005360702713299137,
"calib/step_q_w": 0.9059136253041363,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1565.0,
"completions/max_terminated_length": 1565.0,
"completions/mean_length": 777.8984375,
"completions/mean_terminated_length": 790.24609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.0832,
"grad_norm": 0.07548096030950546,
"kl": 0.18994140625,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0264,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01880320906639099,
"mask/share_reasoning": 0.8600537776947021,
"mask/share_step_conf": 0.10551798343658447,
"num_tokens": 23869428.0,
"reward": 0.6681511402130127,
"reward_std": 0.33723169565200806,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7137347459793091,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.30225497484207153,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.6985340118408203,
"adv/mean_abs_reasoning": 0.5203701257705688,
"adv/mean_abs_step_conf": 0.7808733582496643,
"adv/ratio_final_to_reasoning": 1.3423791590772525,
"adv/ratio_step_to_reasoning": 1.5006114294000639,
"adv/std_final_conf": 0.8765206336975098,
"adv/std_reasoning": 0.7576753497123718,
"adv/std_step_conf": 0.9352340698242188,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6553852414541509,
"calib/avg_num_step_conf": 7.7421875,
"calib/ece": 0.3181392235609105,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8152610441767069,
"calib/gap": 0.10126853861457752,
"calib/mean_conf": 0.8972824631860777,
"calib/mu_c": 0.9367324561403508,
"calib/mu_w": 0.8354639175257733,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3024899598393576,
"calib/std_conf": 0.22779343355685486,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9020017482517484,
"calib/step_q_c_n": 1144.0,
"calib/step_q_gap": 0.002144946342440446,
"calib/step_q_w": 0.899856801909308,
"calib/step_q_w_n": 838.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 762.0703125,
"completions/mean_terminated_length": 777.2510375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.07747872173786163,
"kl": 0.3189697265625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0463,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01893690787255764,
"mask/share_reasoning": 0.8566397428512573,
"mask/share_step_conf": 0.10489208251237869,
"num_tokens": 24170894.0,
"reward": 0.6061649322509766,
"reward_std": 0.3817436099052429,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6577041149139404,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.24134458601474762,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6666533946990967,
"adv/mean_abs_reasoning": 0.5015234351158142,
"adv/mean_abs_step_conf": 0.7864340543746948,
"adv/ratio_final_to_reasoning": 1.3292567166779552,
"adv/ratio_step_to_reasoning": 1.5680903409689873,
"adv/std_final_conf": 0.8309153318405151,
"adv/std_reasoning": 0.7394514679908752,
"adv/std_step_conf": 0.9357711672782898,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5628825197790714,
"calib/avg_num_step_conf": 8.39453125,
"calib/ece": 0.261593625498008,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.8924302788844621,
"calib/gap": 0.08233318405732204,
"calib/mean_conf": 0.935776892430279,
"calib/mu_c": 0.9610344827586207,
"calib/mu_w": 0.8787012987012986,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25207171314741045,
"calib/std_conf": 0.1765681796614607,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9052186379928316,
"calib/step_q_c_n": 1395.0,
"calib/step_q_gap": -0.011293298346691016,
"calib/step_q_w": 0.9165119363395227,
"calib/step_q_w_n": 754.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2486.0,
"completions/max_terminated_length": 2486.0,
"completions/mean_length": 728.54296875,
"completions/mean_terminated_length": 731.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.02758839726448059,
"kl": 0.171112060546875,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0601,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020696930587291718,
"mask/share_reasoning": 0.854810357093811,
"mask/share_step_conf": 0.12058646976947784,
"num_tokens": 24459561.0,
"reward": 0.6907826662063599,
"reward_std": 0.3672957420349121,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.714138925075531,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.336176335811615,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6626981496810913,
"adv/mean_abs_reasoning": 0.512515664100647,
"adv/mean_abs_step_conf": 0.7866781949996948,
"adv/ratio_final_to_reasoning": 1.293030040055423,
"adv/ratio_step_to_reasoning": 1.5349349299989556,
"adv/std_final_conf": 0.8433207869529724,
"adv/std_reasoning": 0.7756554484367371,
"adv/std_step_conf": 0.9362679719924927,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5691447696361177,
"calib/avg_num_step_conf": 7.8984375,
"calib/ece": 0.36096994535519134,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8934426229508197,
"calib/gap": 0.021494694865769493,
"calib/mean_conf": 0.9522267759562841,
"calib/mu_c": 0.960419426048565,
"calib/mu_w": 0.9389247311827955,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3471721311475411,
"calib/std_conf": 0.1373709214728154,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9022589285714286,
"calib/step_q_c_n": 1120.0,
"calib/step_q_gap": -0.0017322022489705269,
"calib/step_q_w": 0.9039911308203992,
"calib/step_q_w_n": 902.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 751.546875,
"completions/mean_terminated_length": 772.6746826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.0864,
"grad_norm": 0.02237076126039028,
"kl": 0.1680908203125,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.0794,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.019753076136112213,
"mask/share_reasoning": 0.8459489941596985,
"mask/share_step_conf": 0.1069541722536087,
"num_tokens": 24758205.0,
"reward": 0.5961042046546936,
"reward_std": 0.4187864065170288,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6139017343521118,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.2712753117084503,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6603360176086426,
"adv/mean_abs_reasoning": 0.5264520049095154,
"adv/mean_abs_step_conf": 0.7618018984794617,
"adv/ratio_final_to_reasoning": 1.2543138053432594,
"adv/ratio_step_to_reasoning": 1.4470490973064056,
"adv/std_final_conf": 0.8467787504196167,
"adv/std_reasoning": 0.7754591107368469,
"adv/std_step_conf": 0.9360379576683044,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.527682119205298,
"calib/avg_num_step_conf": 8.07421875,
"calib/ece": 0.3805843293492698,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9561752988047809,
"calib/gap": 0.017545253863134658,
"calib/mean_conf": 0.9545551128818063,
"calib/mu_c": 0.9615452538631346,
"calib/mu_w": 0.944,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36677290836653403,
"calib/std_conf": 0.16185023037746563,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9057720891824939,
"calib/step_q_c_n": 1211.0,
"calib/step_q_gap": -0.002510621097880028,
"calib/step_q_w": 0.9082827102803739,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 700.9140625,
"completions/mean_terminated_length": 706.4330444335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.01994318887591362,
"kl": 0.177215576171875,
"learning_rate": 3.277777777777778e-06,
"loss": 0.016,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.021094955503940582,
"mask/share_reasoning": 0.8498541116714478,
"mask/share_step_conf": 0.12123844772577286,
"num_tokens": 25043191.0,
"reward": 0.5797907710075378,
"reward_std": 0.3987249433994293,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6056525707244873,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.241428941488266,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.5970791578292847,
"adv/mean_abs_reasoning": 0.42100244760513306,
"adv/mean_abs_step_conf": 0.7423392534255981,
"adv/ratio_final_to_reasoning": 1.418232034577855,
"adv/ratio_step_to_reasoning": 1.7632658851471894,
"adv/std_final_conf": 0.8194170594215393,
"adv/std_reasoning": 0.7205320000648499,
"adv/std_step_conf": 0.9362738132476807,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5783152682992104,
"calib/avg_num_step_conf": 7.3359375,
"calib/ece": 0.39963562753036447,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9433198380566802,
"calib/gap": 0.04008764886926253,
"calib/mean_conf": 0.9599595141700406,
"calib/mu_c": 0.9771631205673759,
"calib/mu_w": 0.9370754716981133,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3943724696356276,
"calib/std_conf": 0.1428866598083284,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9062692307692307,
"calib/step_q_c_n": 1040.0,
"calib/step_q_gap": 0.00563677253534034,
"calib/step_q_w": 0.9006324582338904,
"calib/step_q_w_n": 838.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 819.76171875,
"completions/mean_terminated_length": 829.4822387695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.020195307210087776,
"kl": 0.155517578125,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0086,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01880960538983345,
"mask/share_reasoning": 0.8670693635940552,
"mask/share_step_conf": 0.10240228474140167,
"num_tokens": 25360314.0,
"reward": 0.5373420715332031,
"reward_std": 0.3253489136695862,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5816448926925659,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.1899142861366272,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.6476809978485107,
"adv/mean_abs_reasoning": 0.5205007791519165,
"adv/mean_abs_step_conf": 0.7634508013725281,
"adv/ratio_final_to_reasoning": 1.2443420332699917,
"adv/ratio_step_to_reasoning": 1.4667620721268944,
"adv/std_final_conf": 0.8201258182525635,
"adv/std_reasoning": 0.7755391597747803,
"adv/std_step_conf": 0.9361568093299866,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5499315068493151,
"calib/avg_num_step_conf": 7.78125,
"calib/ece": 0.37711382113821135,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9715447154471545,
"calib/gap": 0.04365616438356179,
"calib/mean_conf": 0.9706097560975612,
"calib/mu_c": 0.9883561643835618,
"calib/mu_w": 0.9447,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37711382113821135,
"calib/std_conf": 0.11152688502634846,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9081099656357388,
"calib/step_q_c_n": 1164.0,
"calib/step_q_gap": -0.001467329050251709,
"calib/step_q_w": 0.9095772946859905,
"calib/step_q_w_n": 828.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 742.08984375,
"completions/mean_terminated_length": 747.9330444335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.0896,
"grad_norm": 0.020287124440073967,
"kl": 0.1703643798828125,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0337,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.020629458129405975,
"mask/share_reasoning": 0.8547236323356628,
"mask/share_step_conf": 0.11683438718318939,
"num_tokens": 25656209.0,
"reward": 0.5711010694503784,
"reward_std": 0.41056394577026367,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6007331609725952,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.2344377040863037,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6982238292694092,
"adv/mean_abs_reasoning": 0.469634473323822,
"adv/mean_abs_step_conf": 0.7610514163970947,
"adv/ratio_final_to_reasoning": 1.4867388765729947,
"adv/ratio_step_to_reasoning": 1.6205186365702229,
"adv/std_final_conf": 0.8545514345169067,
"adv/std_reasoning": 0.7393996119499207,
"adv/std_step_conf": 0.9361210465431213,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5861647276573242,
"calib/avg_num_step_conf": 7.80078125,
"calib/ece": 0.4820731707317073,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9634146341463414,
"calib/gap": 0.020375462718138504,
"calib/mean_conf": 0.9780081300813009,
"calib/mu_c": 0.9882786885245902,
"calib/mu_w": 0.9679032258064517,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4820731707317073,
"calib/std_conf": 0.06663588670766235,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9076089266737514,
"calib/step_q_c_n": 941.0,
"calib/step_q_gap": -0.002088043023218411,
"calib/step_q_w": 0.9096969696969698,
"calib/step_q_w_n": 1056.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2503.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 774.8515625,
"completions/mean_terminated_length": 790.286865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 365.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.02755478024482727,
"kl": 0.1668548583984375,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0236,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019106771796941757,
"mask/share_reasoning": 0.8527930378913879,
"mask/share_step_conf": 0.1085689514875412,
"num_tokens": 25962395.0,
"reward": 0.4849882423877716,
"reward_std": 0.3687390685081482,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.49901872873306274,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.1842389702796936,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.6831027269363403,
"adv/mean_abs_reasoning": 0.44067060947418213,
"adv/mean_abs_step_conf": 0.7617740035057068,
"adv/ratio_final_to_reasoning": 1.5501436044292438,
"adv/ratio_step_to_reasoning": 1.7286698661720878,
"adv/std_final_conf": 0.8571823835372925,
"adv/std_reasoning": 0.7014427781105042,
"adv/std_step_conf": 0.9361329078674316,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.548220472440945,
"calib/avg_num_step_conf": 7.8046875,
"calib/ece": 0.4785714285714287,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9722222222222222,
"calib/gap": 0.010436535433070881,
"calib/mean_conf": 0.9825396825396826,
"calib/mu_c": 0.9877165354330708,
"calib/mu_w": 0.9772799999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4785714285714287,
"calib/std_conf": 0.06547065873698518,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9088988095238096,
"calib/step_q_c_n": 1008.0,
"calib/step_q_gap": 0.003888708513708594,
"calib/step_q_w": 0.905010101010101,
"calib/step_q_w_n": 990.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2881.0,
"completions/max_terminated_length": 2881.0,
"completions/mean_length": 766.34375,
"completions/mean_terminated_length": 775.4308471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 304.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.024390293285250664,
"kl": 0.162445068359375,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0033,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019972484558820724,
"mask/share_reasoning": 0.8537373542785645,
"mask/share_step_conf": 0.11457139253616333,
"num_tokens": 26264091.0,
"reward": 0.46634215116500854,
"reward_std": 0.3684176504611969,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5136839747428894,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.12368777394294739,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.6265145540237427,
"adv/mean_abs_reasoning": 0.4753897190093994,
"adv/mean_abs_step_conf": 0.7652829885482788,
"adv/ratio_final_to_reasoning": 1.3178967255102865,
"adv/ratio_step_to_reasoning": 1.609801301851771,
"adv/std_final_conf": 0.8234379291534424,
"adv/std_reasoning": 0.7394675612449646,
"adv/std_step_conf": 0.936168909072876,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5356804172593645,
"calib/avg_num_step_conf": 7.8515625,
"calib/ece": 0.28661224489795933,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9836734693877551,
"calib/gap": 0.008659712343923043,
"calib/mean_conf": 0.9825306122448979,
"calib/mu_c": 0.9851461988304095,
"calib/mu_w": 0.9764864864864865,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.285591836734694,
"calib/std_conf": 0.05303949319517171,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9073399390243905,
"calib/step_q_c_n": 1312.0,
"calib/step_q_gap": -0.008734559542944886,
"calib/step_q_w": 0.9160744985673354,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 695.91796875,
"completions/mean_terminated_length": 721.2753295898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 355.0,
"epoch": 0.0928,
"grad_norm": 0.016792908310890198,
"kl": 0.1767578125,
"learning_rate": 3.138888888888889e-06,
"loss": -0.1016,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02021612599492073,
"mask/share_reasoning": 0.8318778276443481,
"mask/share_step_conf": 0.11274976283311844,
"num_tokens": 26547742.0,
"reward": 0.6892440319061279,
"reward_std": 0.40235090255737305,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6785781383514404,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.374909907579422,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.5426766872406006,
"adv/mean_abs_reasoning": 0.371280699968338,
"adv/mean_abs_step_conf": 0.7295685410499573,
"adv/ratio_final_to_reasoning": 1.461634518807142,
"adv/ratio_step_to_reasoning": 1.965005294140453,
"adv/std_final_conf": 0.8003331422805786,
"adv/std_reasoning": 0.7011435031890869,
"adv/std_step_conf": 0.9352799654006958,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5501409443269908,
"calib/avg_num_step_conf": 7.203125,
"calib/ece": 0.3319521912350598,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.0030380549682875735,
"calib/mean_conf": 0.9893227091633466,
"calib/mu_c": 0.9903636363636362,
"calib/mu_w": 0.9873255813953487,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3319521912350598,
"calib/std_conf": 0.010929848848345757,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9066103059581321,
"calib/step_q_c_n": 1242.0,
"calib/step_q_gap": 0.005314624895009112,
"calib/step_q_w": 0.901295681063123,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 754.734375,
"completions/mean_terminated_length": 760.6771850585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.014357146807014942,
"kl": 0.1505279541015625,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0539,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019829383119940758,
"mask/share_reasoning": 0.8680585026741028,
"mask/share_step_conf": 0.1042996495962143,
"num_tokens": 26850802.0,
"reward": 0.6862862706184387,
"reward_std": 0.31181150674819946,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6488343477249146,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.3995193541049957,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.5844972133636475,
"adv/mean_abs_reasoning": 0.4211689233779907,
"adv/mean_abs_step_conf": 0.7886731624603271,
"adv/ratio_final_to_reasoning": 1.3877975817296302,
"adv/ratio_step_to_reasoning": 1.8725815668800156,
"adv/std_final_conf": 0.7998574376106262,
"adv/std_reasoning": 0.7013557553291321,
"adv/std_step_conf": 0.9361746907234192,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5437568069703377,
"calib/avg_num_step_conf": 7.62109375,
"calib/ece": 0.46563999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.976,
"calib/gap": 0.014389775129732829,
"calib/mean_conf": 0.9816400000000001,
"calib/mu_c": 0.9886046511627907,
"calib/mu_w": 0.9742148760330579,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46563999999999994,
"calib/std_conf": 0.06436855132749222,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9093837972166999,
"calib/step_q_c_n": 1006.0,
"calib/step_q_gap": 0.010695966528869283,
"calib/step_q_w": 0.8986878306878306,
"calib/step_q_w_n": 945.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 836.5859375,
"completions/mean_terminated_length": 839.86669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.020895086228847504,
"kl": 0.14752197265625,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.0347,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018971944227814674,
"mask/share_reasoning": 0.8709502220153809,
"mask/share_step_conf": 0.10617159307003021,
"num_tokens": 27173856.0,
"reward": 0.5042704939842224,
"reward_std": 0.335740864276886,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5199999809265137,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.19322849810123444,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.6049848794937134,
"adv/mean_abs_reasoning": 0.4883537292480469,
"adv/mean_abs_step_conf": 0.7501746416091919,
"adv/ratio_final_to_reasoning": 1.2388251450956498,
"adv/ratio_step_to_reasoning": 1.5361296467711825,
"adv/std_final_conf": 0.8281879425048828,
"adv/std_reasoning": 0.7753020524978638,
"adv/std_step_conf": 0.9353448152542114,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5343933875035025,
"calib/avg_num_step_conf": 7.7421875,
"calib/ece": 0.33662698412698394,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": -0.0034617539927148666,
"calib/mean_conf": 0.9802777777777779,
"calib/mu_c": 0.9790963855421685,
"calib/mu_w": 0.9825581395348834,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32908730158730143,
"calib/std_conf": 0.08621863486646218,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9016487730061351,
"calib/step_q_c_n": 1304.0,
"calib/step_q_gap": -0.0054898700617114216,
"calib/step_q_w": 0.9071386430678465,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2363.0,
"completions/max_terminated_length": 2363.0,
"completions/mean_length": 760.1171875,
"completions/mean_terminated_length": 766.1023559570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 306.0,
"epoch": 0.096,
"grad_norm": 0.020056363195180893,
"kl": 0.157623291015625,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0471,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02005939930677414,
"mask/share_reasoning": 0.8588818311691284,
"mask/share_step_conf": 0.11324631422758102,
"num_tokens": 27471766.0,
"reward": 0.659142255783081,
"reward_std": 0.35710757970809937,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6485495567321777,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3439536690711975,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.6580387353897095,
"adv/mean_abs_reasoning": 0.5622403621673584,
"adv/mean_abs_step_conf": 0.7587998509407043,
"adv/ratio_final_to_reasoning": 1.1703868659536674,
"adv/ratio_step_to_reasoning": 1.349600459162406,
"adv/std_final_conf": 0.8453028202056885,
"adv/std_reasoning": 0.8099509477615356,
"adv/std_step_conf": 0.9357338547706604,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5231911599099099,
"calib/avg_num_step_conf": 7.49609375,
"calib/ece": 0.37295081967213106,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9795081967213115,
"calib/gap": 0.02687218468468444,
"calib/mean_conf": 0.9782786885245903,
"calib/mu_c": 0.9888513513513512,
"calib/mu_w": 0.9619791666666667,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3723360655737704,
"calib/std_conf": 0.08907716881916157,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9064558139534884,
"calib/step_q_c_n": 1075.0,
"calib/step_q_gap": -0.0003095888901135657,
"calib/step_q_w": 0.906765402843602,
"calib/step_q_w_n": 844.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 837.46875,
"completions/mean_terminated_length": 850.761962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 453.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.013803146779537201,
"kl": 0.146087646484375,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0097,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01792112924158573,
"mask/share_reasoning": 0.867091178894043,
"mask/share_step_conf": 0.09936273097991943,
"num_tokens": 27793870.0,
"reward": 0.5871576070785522,
"reward_std": 0.4421505331993103,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5986281037330627,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.26943719387054443,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.5850780010223389,
"adv/mean_abs_reasoning": 0.413005530834198,
"adv/mean_abs_step_conf": 0.7297500371932983,
"adv/ratio_final_to_reasoning": 1.4166347841409894,
"adv/ratio_step_to_reasoning": 1.766925580195821,
"adv/std_final_conf": 0.8069612979888916,
"adv/std_reasoning": 0.7204309105873108,
"adv/std_step_conf": 0.9356918334960938,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5852906050955414,
"calib/avg_num_step_conf": 8.05078125,
"calib/ece": 0.3537351778656127,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9683794466403162,
"calib/gap": 0.03983114384288711,
"calib/mean_conf": 0.973102766798419,
"calib/mu_c": 0.988216560509554,
"calib/mu_w": 0.9483854166666669,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35314229249011864,
"calib/std_conf": 0.1035456273768553,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9083766233766233,
"calib/step_q_c_n": 1232.0,
"calib/step_q_gap": 0.0006444158977331194,
"calib/step_q_w": 0.9077322074788902,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2309.0,
"completions/max_terminated_length": 2309.0,
"completions/mean_length": 708.21875,
"completions/mean_terminated_length": 713.7952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.016659297049045563,
"kl": 0.17535400390625,
"learning_rate": 3e-06,
"loss": 0.0304,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021326087415218353,
"mask/share_reasoning": 0.8476418852806091,
"mask/share_step_conf": 0.12321953475475311,
"num_tokens": 28081894.0,
"reward": 0.6601039171218872,
"reward_std": 0.32772600650787354,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6406811475753784,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3592141270637512,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.6216708421707153,
"adv/mean_abs_reasoning": 0.5425342321395874,
"adv/mean_abs_step_conf": 0.7974840402603149,
"adv/ratio_final_to_reasoning": 1.1458647313719497,
"adv/ratio_step_to_reasoning": 1.469923910819939,
"adv/std_final_conf": 0.8077294230461121,
"adv/std_reasoning": 0.7755074501037598,
"adv/std_step_conf": 0.9356343746185303,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5152945870461498,
"calib/avg_num_step_conf": 7.70703125,
"calib/ece": 0.39759036144578325,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.01350113046947743,
"calib/mean_conf": 0.9839357429718877,
"calib/mu_c": 0.9895205479452055,
"calib/mu_w": 0.9760194174757281,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39759036144578325,
"calib/std_conf": 0.06190233828775913,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9072117962466488,
"calib/step_q_c_n": 1119.0,
"calib/step_q_gap": 0.007246925052269537,
"calib/step_q_w": 0.8999648711943793,
"calib/step_q_w_n": 854.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2959.0,
"completions/max_terminated_length": 2959.0,
"completions/mean_length": 769.86328125,
"completions/mean_terminated_length": 775.9251708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.0992,
"grad_norm": 0.013393659144639969,
"kl": 0.1565093994140625,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.026,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019578615203499794,
"mask/share_reasoning": 0.8596739768981934,
"mask/share_step_conf": 0.1129348874092102,
"num_tokens": 28384755.0,
"reward": 0.5730220079421997,
"reward_std": 0.4286647140979767,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.585631251335144,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.251818984746933,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.631345808506012,
"adv/mean_abs_reasoning": 0.504591703414917,
"adv/mean_abs_step_conf": 0.7707089781761169,
"adv/ratio_final_to_reasoning": 1.2512013262074333,
"adv/ratio_step_to_reasoning": 1.5273913006500155,
"adv/std_final_conf": 0.8334376811981201,
"adv/std_reasoning": 0.7754099369049072,
"adv/std_step_conf": 0.9354292750358582,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5365646258503401,
"calib/avg_num_step_conf": 7.4921875,
"calib/ece": 0.41232653061224495,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9795918367346939,
"calib/gap": 0.013904761904761642,
"calib/mean_conf": 0.9837551020408163,
"calib/mu_c": 0.9897142857142855,
"calib/mu_w": 0.9758095238095239,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.41232653061224495,
"calib/std_conf": 0.057645833030722286,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9093100189035918,
"calib/step_q_c_n": 1058.0,
"calib/step_q_gap": 0.011997228205917199,
"calib/step_q_w": 0.8973127906976746,
"calib/step_q_w_n": 860.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 762.890625,
"completions/mean_terminated_length": 775.0000610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.026255443692207336,
"kl": 0.170745849609375,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0065,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019794750958681107,
"mask/share_reasoning": 0.8554838299751282,
"mask/share_step_conf": 0.10909644514322281,
"num_tokens": 28688735.0,
"reward": 0.5537281036376953,
"reward_std": 0.37324994802474976,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5632859468460083,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.24417030811309814,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5990731120109558,
"adv/mean_abs_reasoning": 0.49023786187171936,
"adv/mean_abs_step_conf": 0.7478466033935547,
"adv/ratio_final_to_reasoning": 1.2220049869744973,
"adv/ratio_step_to_reasoning": 1.5254770419777244,
"adv/std_final_conf": 0.8059893250465393,
"adv/std_reasoning": 0.7575047016143799,
"adv/std_step_conf": 0.9361989498138428,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.534026402640264,
"calib/avg_num_step_conf": 7.7265625,
"calib/ece": 0.37441832669322705,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9760956175298805,
"calib/gap": 0.043321386138613915,
"calib/mean_conf": 0.9720278884462153,
"calib/mu_c": 0.9894599999999999,
"calib/mu_w": 0.946138613861386,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37441832669322705,
"calib/std_conf": 0.11924745446982661,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9066677908937605,
"calib/step_q_c_n": 1186.0,
"calib/step_q_gap": 0.006187992913962437,
"calib/step_q_w": 0.900479797979798,
"calib/step_q_w_n": 792.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2600.0,
"completions/max_terminated_length": 2600.0,
"completions/mean_length": 769.3359375,
"completions/mean_terminated_length": 775.3936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.012575649656355381,
"kl": 0.1600494384765625,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0173,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01988060027360916,
"mask/share_reasoning": 0.8578656911849976,
"mask/share_step_conf": 0.11444119364023209,
"num_tokens": 28991813.0,
"reward": 0.5770303010940552,
"reward_std": 0.41374671459198,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6137281060218811,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.2270512878894806,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.594207763671875,
"adv/mean_abs_reasoning": 0.4591464400291443,
"adv/mean_abs_step_conf": 0.7937597036361694,
"adv/ratio_final_to_reasoning": 1.2941574013601362,
"adv/ratio_step_to_reasoning": 1.7287724229894619,
"adv/std_final_conf": 0.8037899732589722,
"adv/std_reasoning": 0.7206980586051941,
"adv/std_step_conf": 0.9362131953239441,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4961742424242424,
"calib/avg_num_step_conf": 7.37109375,
"calib/ece": 0.2850199203187249,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9840637450199203,
"calib/gap": 0.013834848484848616,
"calib/mean_conf": 0.9814342629482073,
"calib/mu_c": 0.9855681818181818,
"calib/mu_w": 0.9717333333333332,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28262948207171296,
"calib/std_conf": 0.06363127301674591,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.906852986217458,
"calib/step_q_c_n": 1306.0,
"calib/step_q_gap": 0.005803072275977872,
"calib/step_q_w": 0.9010499139414802,
"calib/step_q_w_n": 581.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2317.0,
"completions/max_terminated_length": 2317.0,
"completions/mean_length": 717.26171875,
"completions/mean_terminated_length": 722.909423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.1024,
"grad_norm": 0.02010061964392662,
"kl": 0.1659393310546875,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020429600030183792,
"mask/share_reasoning": 0.8600814342498779,
"mask/share_step_conf": 0.11167645454406738,
"num_tokens": 29281248.0,
"reward": 0.7246764898300171,
"reward_std": 0.400970995426178,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.6997547149658203,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.4160044193267822,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.6796474456787109,
"adv/mean_abs_reasoning": 0.5447673797607422,
"adv/mean_abs_step_conf": 0.7568874359130859,
"adv/ratio_final_to_reasoning": 1.2475920382332861,
"adv/ratio_step_to_reasoning": 1.3893773086147436,
"adv/std_final_conf": 0.8596211671829224,
"adv/std_reasoning": 0.792809247970581,
"adv/std_step_conf": 0.9363095760345459,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5188064859117492,
"calib/avg_num_step_conf": 8.1484375,
"calib/ece": 0.38115537848605596,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.0034456406166932796,
"calib/mean_conf": 0.9867330677290838,
"calib/mu_c": 0.9880921052631578,
"calib/mu_w": 0.9846464646464645,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38115537848605596,
"calib/std_conf": 0.012452477879124065,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9106488240064882,
"calib/step_q_c_n": 1233.0,
"calib/step_q_gap": 0.003204509821259527,
"calib/step_q_w": 0.9074443141852286,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2847.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 768.609375,
"completions/mean_terminated_length": 768.609375,
"completions/min_length": 313.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.02059132792055607,
"kl": 0.157989501953125,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019798683002591133,
"mask/share_reasoning": 0.8629213571548462,
"mask/share_step_conf": 0.1172800213098526,
"num_tokens": 29583084.0,
"reward": 0.5847292542457581,
"reward_std": 0.42039793729782104,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6053003668785095,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.24931436777114868,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.5588270425796509,
"adv/mean_abs_reasoning": 0.44539546966552734,
"adv/mean_abs_step_conf": 0.7786312699317932,
"adv/ratio_final_to_reasoning": 1.2546760814592608,
"adv/ratio_step_to_reasoning": 1.7481795908622768,
"adv/std_final_conf": 0.7846071720123291,
"adv/std_reasoning": 0.7393063902854919,
"adv/std_step_conf": 0.9350544214248657,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5123342175066312,
"calib/avg_num_step_conf": 7.82421875,
"calib/ece": 0.4010682730923695,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9839357429718876,
"calib/gap": 0.012817904509283773,
"calib/mean_conf": 0.9817911646586346,
"calib/mu_c": 0.9871448275862069,
"calib/mu_w": 0.9743269230769231,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40026506024096387,
"calib/std_conf": 0.06544126219319697,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9075486725663718,
"calib/step_q_c_n": 1130.0,
"calib/step_q_gap": -0.001924408762379648,
"calib/step_q_w": 0.9094730813287515,
"calib/step_q_w_n": 873.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 788.95703125,
"completions/mean_terminated_length": 788.95703125,
"completions/min_length": 349.0,
"completions/min_terminated_length": 349.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.014475611969828606,
"kl": 0.1655731201171875,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.1126,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01985691487789154,
"mask/share_reasoning": 0.8670278787612915,
"mask/share_step_conf": 0.11311522126197815,
"num_tokens": 29891241.0,
"reward": 0.5559430122375488,
"reward_std": 0.37032419443130493,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5827776193618774,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.22129589319229126,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6716063618659973,
"adv/mean_abs_reasoning": 0.5517687797546387,
"adv/mean_abs_step_conf": 0.7963594198226929,
"adv/ratio_final_to_reasoning": 1.2171880441743155,
"adv/ratio_step_to_reasoning": 1.44328466749572,
"adv/std_final_conf": 0.8631746768951416,
"adv/std_reasoning": 0.7929278016090393,
"adv/std_step_conf": 0.9363625645637512,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5258838383838383,
"calib/avg_num_step_conf": 8.015625,
"calib/ece": 0.5740329218106998,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9753086419753086,
"calib/gap": 0.010372474747474714,
"calib/mean_conf": 0.9752674897119343,
"calib/mu_c": 0.9814141414141415,
"calib/mu_w": 0.9710416666666668,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5709465020576133,
"calib/std_conf": 0.09517273905879559,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9105185185185185,
"calib/step_q_c_n": 810.0,
"calib/step_q_gap": 0.016436392914653708,
"calib/step_q_w": 0.8940821256038648,
"calib/step_q_w_n": 1242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3057.0,
"completions/max_terminated_length": 3057.0,
"completions/mean_length": 832.1796875,
"completions/mean_terminated_length": 848.7570190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.1056,
"grad_norm": 0.0244298093020916,
"kl": 0.1494598388671875,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0562,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.017815835773944855,
"mask/share_reasoning": 0.8580272197723389,
"mask/share_step_conf": 0.10462567210197449,
"num_tokens": 30210079.0,
"reward": 0.2601035237312317,
"reward_std": 0.4317253828048706,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.41011834144592285,
"rewards/format_reward_step": 0.94921875,
"rewards/step_correlation_reward": -0.15788006782531738,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.6238076686859131,
"adv/mean_abs_reasoning": 0.45149117708206177,
"adv/mean_abs_step_conf": 0.7373534440994263,
"adv/ratio_final_to_reasoning": 1.3816608172002696,
"adv/ratio_step_to_reasoning": 1.6331513914953137,
"adv/std_final_conf": 0.8372292518615723,
"adv/std_reasoning": 0.7392777800559998,
"adv/std_step_conf": 0.9361663460731506,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5089066339066339,
"calib/avg_num_step_conf": 7.625,
"calib/ece": 0.38210526315789495,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9595141700404858,
"calib/gap": 0.022002457002456954,
"calib/mean_conf": 0.965910931174089,
"calib/mu_c": 0.9747297297297297,
"calib/mu_w": 0.9527272727272728,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3744129554655872,
"calib/std_conf": 0.12706226024148723,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9030574912891987,
"calib/step_q_c_n": 1148.0,
"calib/step_q_gap": -0.001096737566522843,
"calib/step_q_w": 0.9041542288557215,
"calib/step_q_w_n": 804.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2298.0,
"completions/max_terminated_length": 2298.0,
"completions/mean_length": 774.9375,
"completions/mean_terminated_length": 787.2381591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.019371310248970985,
"kl": 0.1641845703125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0149,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019257165491580963,
"mask/share_reasoning": 0.8558045029640198,
"mask/share_step_conf": 0.10931334644556046,
"num_tokens": 30515871.0,
"reward": 0.5841464400291443,
"reward_std": 0.3418727517127991,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.597989022731781,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.2601475417613983,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.6889234781265259,
"adv/mean_abs_reasoning": 0.6015197038650513,
"adv/mean_abs_step_conf": 0.7753009796142578,
"adv/ratio_final_to_reasoning": 1.1453049230139323,
"adv/ratio_step_to_reasoning": 1.2889037127671443,
"adv/std_final_conf": 0.8938189744949341,
"adv/std_reasoning": 0.859066903591156,
"adv/std_step_conf": 0.9366356730461121,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5376422934562469,
"calib/avg_num_step_conf": 8.26953125,
"calib/ece": 0.5155416666666667,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9708333333333333,
"calib/gap": 0.012023884349465663,
"calib/mean_conf": 0.9780416666666667,
"calib/mu_c": 0.9845045045045044,
"calib/mu_w": 0.9724806201550388,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5155416666666667,
"calib/std_conf": 0.07245629669363152,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.891039387308534,
"calib/step_q_c_n": 914.0,
"calib/step_q_gap": -0.01196144394666121,
"calib/step_q_w": 0.9030008312551953,
"calib/step_q_w_n": 1203.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3031.0,
"completions/max_terminated_length": 3031.0,
"completions/mean_length": 840.95703125,
"completions/mean_terminated_length": 861.1400146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.01625613309442997,
"kl": 0.1616668701171875,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0316,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.01756630465388298,
"mask/share_reasoning": 0.8519110083580017,
"mask/share_step_conf": 0.1070852130651474,
"num_tokens": 30838148.0,
"reward": 0.41306352615356445,
"reward_std": 0.47633716464042664,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.45595428347587585,
"rewards/format_reward_step": 0.9375,
"rewards/step_correlation_reward": 0.09517276287078857,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.564067542552948,
"adv/mean_abs_reasoning": 0.3646267056465149,
"adv/mean_abs_step_conf": 0.7716248631477356,
"adv/ratio_final_to_reasoning": 1.546972653999128,
"adv/ratio_step_to_reasoning": 2.116205015152627,
"adv/std_final_conf": 0.7669880986213684,
"adv/std_reasoning": 0.640241801738739,
"adv/std_step_conf": 0.9355287551879883,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5437934458788481,
"calib/avg_num_step_conf": 8.2890625,
"calib/ece": 0.3619685039370081,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9763779527559056,
"calib/gap": 0.0031658391261171204,
"calib/mean_conf": 0.9769291338582676,
"calib/mu_c": 0.9781132075471697,
"calib/mu_w": 0.9749473684210526,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.356456692913386,
"calib/std_conf": 0.07518493536306997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9006338553318419,
"calib/step_q_c_n": 1341.0,
"calib/step_q_gap": -0.005704172837172283,
"calib/step_q_w": 0.9063380281690142,
"calib/step_q_w_n": 781.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1564.0,
"completions/max_terminated_length": 1564.0,
"completions/mean_length": 710.234375,
"completions/mean_terminated_length": 713.0196533203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.1088,
"grad_norm": 0.013416235335171223,
"kl": 0.192138671875,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0266,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.021066762506961823,
"mask/share_reasoning": 0.8479097485542297,
"mask/share_step_conf": 0.12711723148822784,
"num_tokens": 31126664.0,
"reward": 0.6091418862342834,
"reward_std": 0.3119138479232788,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.633550763130188,
"rewards/format_reward_step": 0.9921875,
"rewards/step_correlation_reward": 0.2620766758918762,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.6057738065719604,
"adv/mean_abs_reasoning": 0.460005521774292,
"adv/mean_abs_step_conf": 0.7314463257789612,
"adv/ratio_final_to_reasoning": 1.3168837718195732,
"adv/ratio_step_to_reasoning": 1.5900816211024862,
"adv/std_final_conf": 0.790691077709198,
"adv/std_reasoning": 0.7393838763237,
"adv/std_step_conf": 0.9350913166999817,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5302175956047134,
"calib/avg_num_step_conf": 7.69140625,
"calib/ece": 0.33369918699187007,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.975609756097561,
"calib/gap": 0.013934070700499235,
"calib/mean_conf": 0.9800406504065042,
"calib/mu_c": 0.9849685534591196,
"calib/mu_w": 0.9710344827586204,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33369918699187007,
"calib/std_conf": 0.0657322603815016,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9058534621578099,
"calib/step_q_c_n": 1242.0,
"calib/step_q_gap": -0.001973222849067535,
"calib/step_q_w": 0.9078266850068775,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2569.0,
"completions/max_terminated_length": 2569.0,
"completions/mean_length": 804.5546875,
"completions/mean_terminated_length": 820.5817260742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.019397255033254623,
"kl": 0.150115966796875,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0785,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01852298527956009,
"mask/share_reasoning": 0.8557056188583374,
"mask/share_step_conf": 0.1062401756644249,
"num_tokens": 31437182.0,
"reward": 0.6300480365753174,
"reward_std": 0.3218870460987091,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6362464427947998,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.30666205286979675,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.682067334651947,
"adv/mean_abs_reasoning": 0.4738949239253998,
"adv/mean_abs_step_conf": 0.7475656867027283,
"adv/ratio_final_to_reasoning": 1.4392796804029866,
"adv/ratio_step_to_reasoning": 1.5774924966708646,
"adv/std_final_conf": 0.8657509684562683,
"adv/std_reasoning": 0.7575639486312866,
"adv/std_step_conf": 0.9358437061309814,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5349270612428507,
"calib/avg_num_step_conf": 8.19140625,
"calib/ece": 0.43314,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.948,
"calib/gap": 0.03745549771865553,
"calib/mean_conf": 0.9651400000000001,
"calib/mu_c": 0.9826691729323308,
"calib/mu_w": 0.9452136752136753,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43314,
"calib/std_conf": 0.11107871263207905,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9072242990654206,
"calib/step_q_c_n": 1070.0,
"calib/step_q_gap": 0.0011580867966766162,
"calib/step_q_w": 0.906066212268744,
"calib/step_q_w_n": 1027.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2043.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 779.765625,
"completions/mean_terminated_length": 779.765625,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.02324623055756092,
"kl": 0.1611480712890625,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0021,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019226912409067154,
"mask/share_reasoning": 0.8651635646820068,
"mask/share_step_conf": 0.1156095415353775,
"num_tokens": 31743482.0,
"reward": 0.5363671779632568,
"reward_std": 0.3894505500793457,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5563733577728271,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.2171422243118286,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.6646937727928162,
"adv/mean_abs_reasoning": 0.5343684554100037,
"adv/mean_abs_step_conf": 0.7638580799102783,
"adv/ratio_final_to_reasoning": 1.2438866217932307,
"adv/ratio_step_to_reasoning": 1.429459527741387,
"adv/std_final_conf": 0.8703509569168091,
"adv/std_reasoning": 0.7928860783576965,
"adv/std_step_conf": 0.9356840252876282,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6223418306696605,
"calib/avg_num_step_conf": 8.79296875,
"calib/ece": 0.4010121457489878,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9109311740890689,
"calib/gap": 0.09539426760005276,
"calib/mean_conf": 0.9435222672064778,
"calib/mu_c": 0.9871641791044776,
"calib/mu_w": 0.8917699115044249,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4010121457489878,
"calib/std_conf": 0.1719056754300515,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9093078324225865,
"calib/step_q_c_n": 1098.0,
"calib/step_q_gap": 0.010279211433861457,
"calib/step_q_w": 0.899028620988725,
"calib/step_q_w_n": 1153.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2989.0,
"completions/max_terminated_length": 2989.0,
"completions/mean_length": 807.8515625,
"completions/mean_terminated_length": 820.6746215820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.112,
"grad_norm": 0.021434539929032326,
"kl": 0.152801513671875,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0398,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018452614545822144,
"mask/share_reasoning": 0.8555814027786255,
"mask/share_step_conf": 0.11034099757671356,
"num_tokens": 32056052.0,
"reward": 0.5344028472900391,
"reward_std": 0.4172634780406952,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.587394118309021,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.1829739362001419,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6361554861068726,
"adv/mean_abs_reasoning": 0.4690241813659668,
"adv/mean_abs_step_conf": 0.769987940788269,
"adv/ratio_final_to_reasoning": 1.3563383539291287,
"adv/ratio_step_to_reasoning": 1.641680688074094,
"adv/std_final_conf": 0.8325013518333435,
"adv/std_reasoning": 0.7576159238815308,
"adv/std_step_conf": 0.9357642531394958,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5327891156462585,
"calib/avg_num_step_conf": 7.98046875,
"calib/ece": 0.39783673469387776,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9551020408163265,
"calib/gap": 0.02766666666666684,
"calib/mean_conf": 0.9660000000000001,
"calib/mu_c": 0.9778571428571431,
"calib/mu_w": 0.9501904761904763,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39620408163265325,
"calib/std_conf": 0.10906429483640953,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9052833638025595,
"calib/step_q_c_n": 1094.0,
"calib/step_q_gap": -0.0038314939424353156,
"calib/step_q_w": 0.9091148577449948,
"calib/step_q_w_n": 949.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2027.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 743.40234375,
"completions/mean_terminated_length": 758.211181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.0160527266561985,
"kl": 0.16754150390625,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0214,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01931002363562584,
"mask/share_reasoning": 0.850881814956665,
"mask/share_step_conf": 0.1102769672870636,
"num_tokens": 32350947.0,
"reward": 0.5429900288581848,
"reward_std": 0.348480224609375,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.575013279914856,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.21018551290035248,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6448113322257996,
"adv/mean_abs_reasoning": 0.5004734992980957,
"adv/mean_abs_step_conf": 0.7651152014732361,
"adv/ratio_final_to_reasoning": 1.288402549046323,
"adv/ratio_step_to_reasoning": 1.5287826479250055,
"adv/std_final_conf": 0.8311536908149719,
"adv/std_reasoning": 0.7575410604476929,
"adv/std_step_conf": 0.9356609582901001,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5640840220385676,
"calib/avg_num_step_conf": 8.85546875,
"calib/ece": 0.3256521739130436,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9683794466403162,
"calib/gap": 0.028325757575757615,
"calib/mean_conf": 0.9742687747035573,
"calib/mu_c": 0.9841212121212123,
"calib/mu_w": 0.9557954545454547,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32387351778656137,
"calib/std_conf": 0.07904043402431196,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9090238611713665,
"calib/step_q_c_n": 1383.0,
"calib/step_q_gap": -0.011258944258497938,
"calib/step_q_w": 0.9202828054298644,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1900.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 729.65234375,
"completions/mean_terminated_length": 738.3043823242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.013503472320735455,
"kl": 0.173858642578125,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0111,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02020452544093132,
"mask/share_reasoning": 0.840317964553833,
"mask/share_step_conf": 0.12775877118110657,
"num_tokens": 32642354.0,
"reward": 0.6801269054412842,
"reward_std": 0.3871152102947235,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6680933237075806,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3655979335308075,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.6603453755378723,
"adv/mean_abs_reasoning": 0.5115096569061279,
"adv/mean_abs_step_conf": 0.7684875726699829,
"adv/ratio_final_to_reasoning": 1.2909734285995282,
"adv/ratio_step_to_reasoning": 1.5023911323946235,
"adv/std_final_conf": 0.8369495272636414,
"adv/std_reasoning": 0.7754160761833191,
"adv/std_step_conf": 0.9356595277786255,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5251436781609196,
"calib/avg_num_step_conf": 7.94140625,
"calib/ece": 0.26373983739837403,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9512195121951219,
"calib/gap": 0.031005747126436667,
"calib/mean_conf": 0.9694308943089431,
"calib/mu_c": 0.9785057471264367,
"calib/mu_w": 0.9475,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2629268292682927,
"calib/std_conf": 0.07611289019392023,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9059385903698535,
"calib/step_q_c_n": 1433.0,
"calib/step_q_gap": 0.005038590369853546,
"calib/step_q_w": 0.9008999999999999,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2692.0,
"completions/max_terminated_length": 2692.0,
"completions/mean_length": 792.09765625,
"completions/mean_terminated_length": 798.3346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 350.0,
"epoch": 0.1152,
"grad_norm": 0.01923747919499874,
"kl": 0.1485748291015625,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0034,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0191725455224514,
"mask/share_reasoning": 0.85679030418396,
"mask/share_step_conf": 0.11622464656829834,
"num_tokens": 32948363.0,
"reward": 0.7075442671775818,
"reward_std": 0.37582361698150635,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7027539014816284,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.3834283649921417,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.62883460521698,
"adv/mean_abs_reasoning": 0.36270871758461,
"adv/mean_abs_step_conf": 0.7444422245025635,
"adv/ratio_final_to_reasoning": 1.733717924963549,
"adv/ratio_step_to_reasoning": 2.0524519770576113,
"adv/std_final_conf": 0.8395936489105225,
"adv/std_reasoning": 0.6612196564674377,
"adv/std_step_conf": 0.9360567927360535,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6210819672131147,
"calib/avg_num_step_conf": 8.47265625,
"calib/ece": 0.44287449392712563,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8785425101214575,
"calib/gap": 0.058613770491803274,
"calib/mean_conf": 0.9440890688259109,
"calib/mu_c": 0.9730400000000001,
"calib/mu_w": 0.9144262295081969,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4404453441295548,
"calib/std_conf": 0.1363487140527158,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.905908658420552,
"calib/step_q_c_n": 1051.0,
"calib/step_q_gap": 0.015935492052036815,
"calib/step_q_w": 0.8899731663685152,
"calib/step_q_w_n": 1118.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 788.91015625,
"completions/mean_terminated_length": 804.6255493164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.020503204315900803,
"kl": 0.163543701171875,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0104,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01863759383559227,
"mask/share_reasoning": 0.8506479263305664,
"mask/share_step_conf": 0.11118321120738983,
"num_tokens": 33254924.0,
"reward": 0.4792168438434601,
"reward_std": 0.3232100009918213,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.548890233039856,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.1181371733546257,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.6838954091072083,
"adv/mean_abs_reasoning": 0.4938781261444092,
"adv/mean_abs_step_conf": 0.7572989463806152,
"adv/ratio_final_to_reasoning": 1.3847452901917716,
"adv/ratio_step_to_reasoning": 1.5333721140733052,
"adv/std_final_conf": 0.8846848607063293,
"adv/std_reasoning": 0.7575966119766235,
"adv/std_step_conf": 0.9364821910858154,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5694776714513556,
"calib/avg_num_step_conf": 8.734375,
"calib/ece": 0.37115537848605595,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9243027888446215,
"calib/gap": 0.01622541201488581,
"calib/mean_conf": 0.965179282868526,
"calib/mu_c": 0.9715789473684211,
"calib/mu_w": 0.9553535353535353,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36537848605577705,
"calib/std_conf": 0.08533194925016266,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9116350123864575,
"calib/step_q_c_n": 1211.0,
"calib/step_q_gap": -0.005994255906225487,
"calib/step_q_w": 0.917629268292683,
"calib/step_q_w_n": 1025.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2505.0,
"completions/max_terminated_length": 2505.0,
"completions/mean_length": 742.86328125,
"completions/mean_terminated_length": 748.7125854492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.01990644447505474,
"kl": 0.1675872802734375,
"learning_rate": 2.5e-06,
"loss": 0.0079,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020663520321249962,
"mask/share_reasoning": 0.8460651636123657,
"mask/share_step_conf": 0.12545883655548096,
"num_tokens": 33550017.0,
"reward": 0.6041939854621887,
"reward_std": 0.40416789054870605,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6198753714561462,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.27445003390312195,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.7322089076042175,
"adv/mean_abs_reasoning": 0.5857294797897339,
"adv/mean_abs_step_conf": 0.7790755033493042,
"adv/ratio_final_to_reasoning": 1.2500803406157175,
"adv/ratio_step_to_reasoning": 1.33009440403952,
"adv/std_final_conf": 0.8689647912979126,
"adv/std_reasoning": 0.8266346454620361,
"adv/std_step_conf": 0.9363865256309509,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6608518047277842,
"calib/avg_num_step_conf": 7.80859375,
"calib/ece": 0.3606910569105691,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8252032520325203,
"calib/gap": 0.10853746735418202,
"calib/mean_conf": 0.9176016260162603,
"calib/mu_c": 0.9656934306569344,
"calib/mu_w": 0.8571559633027523,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3606910569105691,
"calib/std_conf": 0.18651581767040165,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.906980244590781,
"calib/step_q_c_n": 1063.0,
"calib/step_q_gap": 0.006221697582233898,
"calib/step_q_w": 0.9007585470085471,
"calib/step_q_w_n": 936.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3043.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 799.48828125,
"completions/mean_terminated_length": 808.9683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.1184,
"grad_norm": 0.027693284675478935,
"kl": 0.1625518798828125,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0956,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01975979655981064,
"mask/share_reasoning": 0.8559533357620239,
"mask/share_step_conf": 0.11256811767816544,
"num_tokens": 33862094.0,
"reward": 0.5763823986053467,
"reward_std": 0.4253450036048889,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6168433427810669,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.23592141270637512,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.7116504311561584,
"adv/mean_abs_reasoning": 0.5469608306884766,
"adv/mean_abs_step_conf": 0.7545567154884338,
"adv/ratio_final_to_reasoning": 1.3010994411800603,
"adv/ratio_step_to_reasoning": 1.3795443350827332,
"adv/std_final_conf": 0.8786104917526245,
"adv/std_reasoning": 0.7929294109344482,
"adv/std_step_conf": 0.9363153576850891,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6716371863430688,
"calib/avg_num_step_conf": 7.8203125,
"calib/ece": 0.32983673469387764,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7877551020408163,
"calib/gap": 0.12024818318935948,
"calib/mean_conf": 0.8827346938775512,
"calib/mu_c": 0.9327972027972027,
"calib/mu_w": 0.8125490196078432,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3144489795918368,
"calib/std_conf": 0.23465709028457812,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8976138279932547,
"calib/step_q_c_n": 1186.0,
"calib/step_q_gap": 0.00026088681678415604,
"calib/step_q_w": 0.8973529411764706,
"calib/step_q_w_n": 816.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 798.03125,
"completions/mean_terminated_length": 813.9282836914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.02171127125620842,
"kl": 0.1656036376953125,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0284,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018730318173766136,
"mask/share_reasoning": 0.8514347076416016,
"mask/share_step_conf": 0.11030372977256775,
"num_tokens": 34174310.0,
"reward": 0.5774442553520203,
"reward_std": 0.38559865951538086,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6421105265617371,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.20965296030044556,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.7501026391983032,
"adv/mean_abs_reasoning": 0.5847432017326355,
"adv/mean_abs_step_conf": 0.7957204580307007,
"adv/ratio_final_to_reasoning": 1.2827898417214532,
"adv/ratio_step_to_reasoning": 1.3608032648740929,
"adv/std_final_conf": 0.8902297019958496,
"adv/std_reasoning": 0.8099881410598755,
"adv/std_step_conf": 0.9362003803253174,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5998031763268632,
"calib/avg_num_step_conf": 8.3046875,
"calib/ece": 0.37061224489795924,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8163265306122449,
"calib/gap": 0.06876611918012743,
"calib/mean_conf": 0.9269387755102042,
"calib/mu_c": 0.9566906474820143,
"calib/mu_w": 0.8879245283018868,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3651020408163266,
"calib/std_conf": 0.15780385760827156,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9058223394898858,
"calib/step_q_c_n": 1137.0,
"calib/step_q_gap": 0.022546303089481246,
"calib/step_q_w": 0.8832760364004045,
"calib/step_q_w_n": 989.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 722.87890625,
"completions/mean_terminated_length": 743.2008056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.01958407275378704,
"kl": 0.196502685546875,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0467,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.020146111026406288,
"mask/share_reasoning": 0.8327199220657349,
"mask/share_step_conf": 0.1197902113199234,
"num_tokens": 34464567.0,
"reward": 0.5459908246994019,
"reward_std": 0.43909066915512085,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6068406105041504,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.18435978889465332,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6415958404541016,
"adv/mean_abs_reasoning": 0.44854453206062317,
"adv/mean_abs_step_conf": 0.7478067874908447,
"adv/ratio_final_to_reasoning": 1.4303949654822377,
"adv/ratio_step_to_reasoning": 1.6671851600896892,
"adv/std_final_conf": 0.8132436275482178,
"adv/std_reasoning": 0.739156186580658,
"adv/std_step_conf": 0.9348743557929993,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6588422656141808,
"calib/avg_num_step_conf": 8.20703125,
"calib/ece": 0.2722134387351779,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8537549407114624,
"calib/gap": 0.12430272815399512,
"calib/mean_conf": 0.9283399209486165,
"calib/mu_c": 0.9710843373493975,
"calib/mu_w": 0.8467816091954024,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2722134387351779,
"calib/std_conf": 0.16779555090747067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9076392961876832,
"calib/step_q_c_n": 1364.0,
"calib/step_q_gap": 0.016268875563531182,
"calib/step_q_w": 0.8913704206241521,
"calib/step_q_w_n": 737.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1476.0,
"completions/max_terminated_length": 1476.0,
"completions/mean_length": 699.87109375,
"completions/mean_terminated_length": 705.3818969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.1216,
"grad_norm": 0.018764222040772438,
"kl": 0.18096923828125,
"learning_rate": 2.388888888888889e-06,
"loss": 0.011,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021557755768299103,
"mask/share_reasoning": 0.8419281244277954,
"mask/share_step_conf": 0.1287015974521637,
"num_tokens": 34748758.0,
"reward": 0.7011272311210632,
"reward_std": 0.34007567167282104,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7196776866912842,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.35523295402526855,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.75199294090271,
"adv/mean_abs_reasoning": 0.4932301640510559,
"adv/mean_abs_step_conf": 0.7876946926116943,
"adv/ratio_final_to_reasoning": 1.5246288562855792,
"adv/ratio_step_to_reasoning": 1.5970124092616473,
"adv/std_final_conf": 0.8954560160636902,
"adv/std_reasoning": 0.7394359111785889,
"adv/std_step_conf": 0.9358144402503967,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.58203026481715,
"calib/avg_num_step_conf": 7.95703125,
"calib/ece": 0.43460317460317477,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7936507936507936,
"calib/gap": 0.06140983606557393,
"calib/mean_conf": 0.9187301587301587,
"calib/mu_c": 0.9504098360655738,
"calib/mu_w": 0.8889999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.43460317460317477,
"calib/std_conf": 0.14954154345008988,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9002152852529602,
"calib/step_q_c_n": 929.0,
"calib/step_q_gap": -0.007970635324657027,
"calib/step_q_w": 0.9081859205776173,
"calib/step_q_w_n": 1108.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2307.0,
"completions/max_terminated_length": 2307.0,
"completions/mean_length": 754.8359375,
"completions/mean_terminated_length": 757.796142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.03260897099971771,
"kl": 0.18072509765625,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0386,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019767742604017258,
"mask/share_reasoning": 0.8599216938018799,
"mask/share_step_conf": 0.11640430986881256,
"num_tokens": 35047260.0,
"reward": 0.46756333112716675,
"reward_std": 0.4043220579624176,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.5565828084945679,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.08791884034872055,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7125102281570435,
"adv/mean_abs_reasoning": 0.5437763929367065,
"adv/mean_abs_step_conf": 0.7766680717468262,
"adv/ratio_final_to_reasoning": 1.310300037684749,
"adv/ratio_step_to_reasoning": 1.4282857472947108,
"adv/std_final_conf": 0.8894214034080505,
"adv/std_reasoning": 0.7754684686660767,
"adv/std_step_conf": 0.9361055493354797,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6414127887512554,
"calib/avg_num_step_conf": 8.140625,
"calib/ece": 0.30568548387096767,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7096774193548387,
"calib/gap": 0.10666287244727146,
"calib/mean_conf": 0.867217741935484,
"calib/mu_c": 0.9115172413793105,
"calib/mu_w": 0.804854368932039,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2941129032258064,
"calib/std_conf": 0.22687729597110243,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9069122807017544,
"calib/step_q_c_n": 1140.0,
"calib/step_q_gap": 0.01308812815938154,
"calib/step_q_w": 0.8938241525423729,
"calib/step_q_w_n": 944.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2959.0,
"completions/max_terminated_length": 2959.0,
"completions/mean_length": 767.15234375,
"completions/mean_terminated_length": 782.4342651367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.021685892716050148,
"kl": 0.1681671142578125,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0286,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.019999029114842415,
"mask/share_reasoning": 0.8439480066299438,
"mask/share_step_conf": 0.11652170121669769,
"num_tokens": 35348171.0,
"reward": 0.5921616554260254,
"reward_std": 0.39703428745269775,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6525866985321045,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.2254866510629654,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7528530359268188,
"adv/mean_abs_reasoning": 0.5408031940460205,
"adv/mean_abs_step_conf": 0.7640414237976074,
"adv/ratio_final_to_reasoning": 1.3921016817492273,
"adv/ratio_step_to_reasoning": 1.4127901465992267,
"adv/std_final_conf": 0.9162237048149109,
"adv/std_reasoning": 0.7929183840751648,
"adv/std_step_conf": 0.9363798499107361,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6521139705882353,
"calib/avg_num_step_conf": 7.87890625,
"calib/ece": 0.3748987854251013,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.728744939271255,
"calib/gap": 0.1118034401260507,
"calib/mean_conf": 0.8834008097165993,
"calib/mu_c": 0.9372656250000002,
"calib/mu_w": 0.8254621848739495,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.37004048582995963,
"calib/std_conf": 0.19722922112952876,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.9014519230769231,
"calib/step_q_c_n": 1040.0,
"calib/step_q_gap": 0.0030179415006691412,
"calib/step_q_w": 0.898433981576254,
"calib/step_q_w_n": 977.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 723.44921875,
"completions/mean_terminated_length": 737.860595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 373.0,
"epoch": 0.1248,
"grad_norm": 0.026363488286733627,
"kl": 0.1810150146484375,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0393,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020078789442777634,
"mask/share_reasoning": 0.8408745527267456,
"mask/share_step_conf": 0.11951541900634766,
"num_tokens": 35639974.0,
"reward": 0.5382317304611206,
"reward_std": 0.4197598695755005,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5978370904922485,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.1895637810230255,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.676365315914154,
"adv/mean_abs_reasoning": 0.3761560320854187,
"adv/mean_abs_step_conf": 0.7523123025894165,
"adv/ratio_final_to_reasoning": 1.7980977525851902,
"adv/ratio_step_to_reasoning": 2.00000063382894,
"adv/std_final_conf": 0.892826497554779,
"adv/std_reasoning": 0.6815088391304016,
"adv/std_step_conf": 0.9354512095451355,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.705430463576159,
"calib/avg_num_step_conf": 8.625,
"calib/ece": 0.24362549800796823,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6374501992031872,
"calib/gap": 0.19732582781456953,
"calib/mean_conf": 0.8225099601593625,
"calib/mu_c": 0.9011258278145695,
"calib/mu_w": 0.7038,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23227091633466146,
"calib/std_conf": 0.25377719252646,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8957572383073497,
"calib/step_q_c_n": 1347.0,
"calib/step_q_gap": 0.019020885229533158,
"calib/step_q_w": 0.8767363530778165,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2662.0,
"completions/max_terminated_length": 2662.0,
"completions/mean_length": 764.70703125,
"completions/mean_terminated_length": 770.7283325195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.02284431643784046,
"kl": 0.16656494140625,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0102,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02022838033735752,
"mask/share_reasoning": 0.8493303656578064,
"mask/share_step_conf": 0.12262875586748123,
"num_tokens": 35939747.0,
"reward": 0.6449134349822998,
"reward_std": 0.24360501766204834,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7236628532409668,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.2521013617515564,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.7165407538414001,
"adv/mean_abs_reasoning": 0.5705307126045227,
"adv/mean_abs_step_conf": 0.7834902405738831,
"adv/ratio_final_to_reasoning": 1.25591968672524,
"adv/ratio_step_to_reasoning": 1.3732656687265465,
"adv/std_final_conf": 0.8990749716758728,
"adv/std_reasoning": 0.8099278211593628,
"adv/std_step_conf": 0.9363142251968384,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7174507124731603,
"calib/avg_num_step_conf": 7.9609375,
"calib/ece": 0.19856000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.472,
"calib/gap": 0.21342442579217902,
"calib/mean_conf": 0.74872,
"calib/mu_c": 0.84177304964539,
"calib/mu_w": 0.628348623853211,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1916400000000001,
"calib/std_conf": 0.28160532949502215,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8987666034155597,
"calib/step_q_c_n": 1054.0,
"calib/step_q_gap": 0.014356034309868648,
"calib/step_q_w": 0.884410569105691,
"calib/step_q_w_n": 984.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 826.421875,
"completions/mean_terminated_length": 832.9291381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.03922202065587044,
"kl": 0.1675872802734375,
"learning_rate": 2.25e-06,
"loss": -0.0041,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01915661431849003,
"mask/share_reasoning": 0.8651344776153564,
"mask/share_step_conf": 0.10789638757705688,
"num_tokens": 36256375.0,
"reward": 0.6312820315361023,
"reward_std": 0.356397807598114,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7280832529067993,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.22979329526424408,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.7109464406967163,
"adv/mean_abs_reasoning": 0.5058482885360718,
"adv/mean_abs_step_conf": 0.7477468252182007,
"adv/ratio_final_to_reasoning": 1.4054538817442674,
"adv/ratio_step_to_reasoning": 1.4782037266196646,
"adv/std_final_conf": 0.9009770154953003,
"adv/std_reasoning": 0.7753273844718933,
"adv/std_step_conf": 0.9359827041625977,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7323376977632295,
"calib/avg_num_step_conf": 7.74609375,
"calib/ece": 0.22012000000000015,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.528,
"calib/gap": 0.20368657937806878,
"calib/mean_conf": 0.7747600000000001,
"calib/mu_c": 0.8513461538461539,
"calib/mu_w": 0.6476595744680851,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18544000000000013,
"calib/std_conf": 0.28982087985512706,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9006685714285714,
"calib/step_q_c_n": 1225.0,
"calib/step_q_gap": 0.0040063022992836705,
"calib/step_q_w": 0.8966622691292877,
"calib/step_q_w_n": 758.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 735.25,
"completions/mean_terminated_length": 743.9683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.128,
"grad_norm": 0.028242425993084908,
"kl": 0.1798095703125,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0093,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020412635058164597,
"mask/share_reasoning": 0.8511585593223572,
"mask/share_step_conf": 0.11671006679534912,
"num_tokens": 36551287.0,
"reward": 0.6879016160964966,
"reward_std": 0.3711988925933838,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7291804552078247,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.3309977650642395,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7510311603546143,
"adv/mean_abs_reasoning": 0.5703496932983398,
"adv/mean_abs_step_conf": 0.7499484419822693,
"adv/ratio_final_to_reasoning": 1.3167906797869762,
"adv/ratio_step_to_reasoning": 1.3148923384972955,
"adv/std_final_conf": 0.9112793803215027,
"adv/std_reasoning": 0.8101462125778198,
"adv/std_step_conf": 0.936364471912384,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6045390070921985,
"calib/avg_num_step_conf": 7.49609375,
"calib/ece": 0.2352868852459017,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4918032786885246,
"calib/gap": 0.12740567375886536,
"calib/mean_conf": 0.7654508196721314,
"calib/mu_c": 0.8145333333333333,
"calib/mu_w": 0.687127659574468,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1929918032786886,
"calib/std_conf": 0.27767957127344967,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8997903930131003,
"calib/step_q_c_n": 1145.0,
"calib/step_q_gap": 0.002245173374857301,
"calib/step_q_w": 0.897545219638243,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2598.0,
"completions/max_terminated_length": 2598.0,
"completions/mean_length": 786.95703125,
"completions/mean_terminated_length": 809.0802612304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.030293744057416916,
"kl": 0.165740966796875,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0506,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01884043961763382,
"mask/share_reasoning": 0.8479143381118774,
"mask/share_step_conf": 0.10590147972106934,
"num_tokens": 36857804.0,
"reward": 0.6501889228820801,
"reward_std": 0.3920496702194214,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6897769570350647,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.3027883470058441,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.7212374210357666,
"adv/mean_abs_reasoning": 0.4761502146720886,
"adv/mean_abs_step_conf": 0.7682005167007446,
"adv/ratio_final_to_reasoning": 1.5147266530846,
"adv/ratio_step_to_reasoning": 1.6133574931385528,
"adv/std_final_conf": 0.9036675095558167,
"adv/std_reasoning": 0.7393293380737305,
"adv/std_step_conf": 0.9357211589813232,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6959190672153635,
"calib/avg_num_step_conf": 7.89453125,
"calib/ece": 0.1980555555555555,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.626984126984127,
"calib/gap": 0.19480246913580235,
"calib/mean_conf": 0.8317857142857144,
"calib/mu_c": 0.901358024691358,
"calib/mu_w": 0.7065555555555556,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19349206349206344,
"calib/std_conf": 0.24911029326107098,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9024658634538152,
"calib/step_q_c_n": 1245.0,
"calib/step_q_gap": 0.005571533556907982,
"calib/step_q_w": 0.8968943298969072,
"calib/step_q_w_n": 776.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 721.3984375,
"completions/mean_terminated_length": 729.9525756835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 372.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.024334616959095,
"kl": 0.19036865234375,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0019,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02030223235487938,
"mask/share_reasoning": 0.8518544435501099,
"mask/share_step_conf": 0.11612460017204285,
"num_tokens": 37149826.0,
"reward": 0.7178915739059448,
"reward_std": 0.34284746646881104,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7501226663589478,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.36300432682037354,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7650114297866821,
"adv/mean_abs_reasoning": 0.5763481259346008,
"adv/mean_abs_step_conf": 0.7651894092559814,
"adv/ratio_final_to_reasoning": 1.3273426170097224,
"adv/ratio_step_to_reasoning": 1.3276514225064189,
"adv/std_final_conf": 0.8993167281150818,
"adv/std_reasoning": 0.8100424408912659,
"adv/std_step_conf": 0.9361500144004822,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5572519083969466,
"calib/avg_num_step_conf": 8.13671875,
"calib/ece": 0.2889754098360656,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.46311475409836067,
"calib/gap": 0.06475511720597171,
"calib/mean_conf": 0.7262704918032789,
"calib/mu_c": 0.7562595419847328,
"calib/mu_w": 0.6915044247787611,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23918032786885252,
"calib/std_conf": 0.30409571769873533,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8969036027263877,
"calib/step_q_c_n": 1027.0,
"calib/step_q_gap": 0.007007769393054364,
"calib/step_q_w": 0.8898958333333333,
"calib/step_q_w_n": 1056.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 789.74609375,
"completions/mean_terminated_length": 815.2217407226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 361.0,
"epoch": 0.1312,
"grad_norm": 0.034101828932762146,
"kl": 0.1665191650390625,
"learning_rate": 2.138888888888889e-06,
"loss": -0.1115,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.018441390246152878,
"mask/share_reasoning": 0.8417726159095764,
"mask/share_step_conf": 0.1085360199213028,
"num_tokens": 37457289.0,
"reward": 0.5628241896629333,
"reward_std": 0.39414483308792114,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6245074272155762,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.20817218720912933,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.7271085977554321,
"adv/mean_abs_reasoning": 0.46908581256866455,
"adv/mean_abs_step_conf": 0.7774999737739563,
"adv/ratio_final_to_reasoning": 1.5500545492387885,
"adv/ratio_step_to_reasoning": 1.6574791923815564,
"adv/std_final_conf": 0.9254276156425476,
"adv/std_reasoning": 0.7206630706787109,
"adv/std_step_conf": 0.9363216161727905,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.680831265508685,
"calib/avg_num_step_conf": 8.03515625,
"calib/ece": 0.19795180722891564,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5622489959839357,
"calib/gap": 0.18590777502067835,
"calib/mean_conf": 0.7848594377510041,
"calib/mu_c": 0.8542948717948718,
"calib/mu_w": 0.6683870967741935,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17815261044176706,
"calib/std_conf": 0.2836779388908106,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9042563681183238,
"calib/step_q_c_n": 1217.0,
"calib/step_q_gap": 0.030018272880228403,
"calib/step_q_w": 0.8742380952380954,
"calib/step_q_w_n": 840.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2524.0,
"completions/max_terminated_length": 2524.0,
"completions/mean_length": 761.8359375,
"completions/mean_terminated_length": 777.011962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 309.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.028144368901848793,
"kl": 0.176025390625,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0094,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019312703981995583,
"mask/share_reasoning": 0.8514062762260437,
"mask/share_step_conf": 0.10974974930286407,
"num_tokens": 37759135.0,
"reward": 0.7117006778717041,
"reward_std": 0.3516343832015991,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.726138710975647,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.38163769245147705,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7411487102508545,
"adv/mean_abs_reasoning": 0.4857451319694519,
"adv/mean_abs_step_conf": 0.7906614542007446,
"adv/ratio_final_to_reasoning": 1.5257975046417238,
"adv/ratio_step_to_reasoning": 1.6277290335263075,
"adv/std_final_conf": 0.9077005386352539,
"adv/std_reasoning": 0.7575331330299377,
"adv/std_step_conf": 0.9356077313423157,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6455156371155337,
"calib/avg_num_step_conf": 7.6171875,
"calib/ece": 0.26476190476190475,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5793650793650794,
"calib/gap": 0.1416024812613078,
"calib/mean_conf": 0.7861904761904762,
"calib/mu_c": 0.8457534246575342,
"calib/mu_w": 0.7041509433962264,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2357936507936508,
"calib/std_conf": 0.2885386198542446,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9060358744394619,
"calib/step_q_c_n": 1115.0,
"calib/step_q_gap": 0.007401143900539586,
"calib/step_q_w": 0.8986347305389223,
"calib/step_q_w_n": 835.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 802.55078125,
"completions/mean_terminated_length": 808.8700561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.026144176721572876,
"kl": 0.166656494140625,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0016,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019170530140399933,
"mask/share_reasoning": 0.8632123470306396,
"mask/share_step_conf": 0.10980463027954102,
"num_tokens": 38069396.0,
"reward": 0.6401241421699524,
"reward_std": 0.35746702551841736,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6850773692131042,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.28501462936401367,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.6534983515739441,
"adv/mean_abs_reasoning": 0.4111436605453491,
"adv/mean_abs_step_conf": 0.7711385488510132,
"adv/ratio_final_to_reasoning": 1.5894647401522157,
"adv/ratio_step_to_reasoning": 1.8755939172895422,
"adv/std_final_conf": 0.8342850804328918,
"adv/std_reasoning": 0.7013512253761292,
"adv/std_step_conf": 0.9359316229820251,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7610365302697819,
"calib/avg_num_step_conf": 8.12890625,
"calib/ece": 0.27228915662650605,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5983935742971888,
"calib/gap": 0.29177423518781487,
"calib/mean_conf": 0.7622489959839358,
"calib/mu_c": 0.9110655737704918,
"calib/mu_w": 0.6192913385826769,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27228915662650605,
"calib/std_conf": 0.3206377966421584,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9034669338677356,
"calib/step_q_c_n": 998.0,
"calib/step_q_gap": 0.023679306905593167,
"calib/step_q_w": 0.8797876269621424,
"calib/step_q_w_n": 1083.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2871.0,
"completions/max_terminated_length": 2871.0,
"completions/mean_length": 784.76953125,
"completions/mean_terminated_length": 797.2262573242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.1344,
"grad_norm": 0.027145879343152046,
"kl": 0.16473388671875,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0515,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019449051469564438,
"mask/share_reasoning": 0.8491480350494385,
"mask/share_step_conf": 0.11577790975570679,
"num_tokens": 38375761.0,
"reward": 0.5556277632713318,
"reward_std": 0.29540807008743286,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6993195414543152,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.12209223210811615,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.7096794247627258,
"adv/mean_abs_reasoning": 0.45027339458465576,
"adv/mean_abs_step_conf": 0.7957907915115356,
"adv/ratio_final_to_reasoning": 1.5761078342577917,
"adv/ratio_step_to_reasoning": 1.7673502389489266,
"adv/std_final_conf": 0.8831758499145508,
"adv/std_reasoning": 0.7207463979721069,
"adv/std_step_conf": 0.9359408020973206,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7281753965080635,
"calib/avg_num_step_conf": 7.984375,
"calib/ece": 0.285469387755102,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6326530612244898,
"calib/gap": 0.255379181660669,
"calib/mean_conf": 0.7834285714285716,
"calib/mu_c": 0.9116393442622951,
"calib/mu_w": 0.6562601626016261,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.285469387755102,
"calib/std_conf": 0.31052503267589776,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9082281059063136,
"calib/step_q_c_n": 982.0,
"calib/step_q_gap": 0.024923021160550896,
"calib/step_q_w": 0.8833050847457627,
"calib/step_q_w_n": 1062.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 784.03125,
"completions/mean_terminated_length": 793.328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.02291964553296566,
"kl": 0.1801605224609375,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0462,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019718363881111145,
"mask/share_reasoning": 0.8533198237419128,
"mask/share_step_conf": 0.1152430847287178,
"num_tokens": 38680145.0,
"reward": 0.5593506097793579,
"reward_std": 0.35522782802581787,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6697046756744385,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.16227777302265167,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7020530700683594,
"adv/mean_abs_reasoning": 0.5561271905899048,
"adv/mean_abs_step_conf": 0.7891441583633423,
"adv/ratio_final_to_reasoning": 1.2623965919085265,
"adv/ratio_step_to_reasoning": 1.4189994154507493,
"adv/std_final_conf": 0.8692341446876526,
"adv/std_reasoning": 0.775589644908905,
"adv/std_step_conf": 0.9360777735710144,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6191236413043478,
"calib/avg_num_step_conf": 7.87109375,
"calib/ece": 0.3292181069958847,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6255144032921811,
"calib/gap": 0.12339470108695638,
"calib/mean_conf": 0.794650205761317,
"calib/mu_c": 0.853046875,
"calib/mu_w": 0.7296521739130436,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.298559670781893,
"calib/std_conf": 0.29337854076085734,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9101501501501501,
"calib/step_q_c_n": 999.0,
"calib/step_q_gap": 0.02262751235487459,
"calib/step_q_w": 0.8875226377952755,
"calib/step_q_w_n": 1016.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2972.0,
"completions/max_terminated_length": 2972.0,
"completions/mean_length": 762.4453125,
"completions/mean_terminated_length": 783.8795166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 402.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.023058714345097542,
"kl": 0.17486572265625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0412,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.019529111683368683,
"mask/share_reasoning": 0.8425556421279907,
"mask/share_step_conf": 0.1105714738368988,
"num_tokens": 38981995.0,
"reward": 0.5060725808143616,
"reward_std": 0.402359277009964,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.617258608341217,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.10582408308982849,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.5893697142601013,
"adv/mean_abs_reasoning": 0.4902503490447998,
"adv/mean_abs_step_conf": 0.7598936557769775,
"adv/ratio_final_to_reasoning": 1.2021811211525397,
"adv/ratio_step_to_reasoning": 1.55001145283741,
"adv/std_final_conf": 0.8214588761329651,
"adv/std_reasoning": 0.7574241161346436,
"adv/std_step_conf": 0.935619592666626,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6773800105485231,
"calib/avg_num_step_conf": 8.30078125,
"calib/ece": 0.2732677165354331,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7480314960629921,
"calib/gap": 0.1619712552742616,
"calib/mean_conf": 0.8681496062992126,
"calib/mu_c": 0.929367088607595,
"calib/mu_w": 0.7673958333333334,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25968503937007875,
"calib/std_conf": 0.24575955148379136,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9087342814371258,
"calib/step_q_c_n": 1336.0,
"calib/step_q_gap": 0.0032463219947935773,
"calib/step_q_w": 0.9054879594423322,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2073.0,
"completions/max_terminated_length": 2073.0,
"completions/mean_length": 724.203125,
"completions/mean_terminated_length": 727.0431518554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.1376,
"grad_norm": 0.020458418875932693,
"kl": 0.17547607421875,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0043,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020674947649240494,
"mask/share_reasoning": 0.8490486741065979,
"mask/share_step_conf": 0.12637010216712952,
"num_tokens": 39269775.0,
"reward": 0.6816619634628296,
"reward_std": 0.3260408043861389,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7115363478660583,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3306938707828522,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.6116889715194702,
"adv/mean_abs_reasoning": 0.32955822348594666,
"adv/mean_abs_step_conf": 0.7390551567077637,
"adv/ratio_final_to_reasoning": 1.8560877196425185,
"adv/ratio_step_to_reasoning": 2.24256323781063,
"adv/std_final_conf": 0.8465979099273682,
"adv/std_reasoning": 0.6402920484542847,
"adv/std_step_conf": 0.9348527193069458,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6206658854346176,
"calib/avg_num_step_conf": 7.921875,
"calib/ece": 0.2692063492063493,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7936507936507936,
"calib/gap": 0.1499165919900739,
"calib/mean_conf": 0.8553968253968254,
"calib/mu_c": 0.9083435582822086,
"calib/mu_w": 0.7584269662921347,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23888888888888898,
"calib/std_conf": 0.28490502676442225,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9076360808709176,
"calib/step_q_c_n": 1286.0,
"calib/step_q_gap": -0.0015957250590016292,
"calib/step_q_w": 0.9092318059299193,
"calib/step_q_w_n": 742.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 690.96484375,
"completions/mean_terminated_length": 693.674560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.03185328468680382,
"kl": 0.1677703857421875,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0024,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02168242260813713,
"mask/share_reasoning": 0.8475232124328613,
"mask/share_step_conf": 0.12688809633255005,
"num_tokens": 39551950.0,
"reward": 0.7144206166267395,
"reward_std": 0.25448256731033325,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7040468454360962,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.40135687589645386,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6819969415664673,
"adv/mean_abs_reasoning": 0.4341234564781189,
"adv/mean_abs_step_conf": 0.7591865658760071,
"adv/ratio_final_to_reasoning": 1.5709746418662867,
"adv/ratio_step_to_reasoning": 1.7487803401249118,
"adv/std_final_conf": 0.8898735046386719,
"adv/std_reasoning": 0.720470130443573,
"adv/std_step_conf": 0.9359843730926514,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6664562848790858,
"calib/avg_num_step_conf": 8.0703125,
"calib/ece": 0.3766935483870968,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6451612903225806,
"calib/gap": 0.20516077597661453,
"calib/mean_conf": 0.7817741935483871,
"calib/mu_c": 0.8992452830188681,
"calib/mu_w": 0.6940845070422536,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36552419354838717,
"calib/std_conf": 0.31874008115395064,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9083021077283373,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": 0.005991876705235044,
"calib/step_q_w": 0.9023102310231023,
"calib/step_q_w_n": 1212.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 746.39453125,
"completions/mean_terminated_length": 761.2630004882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 413.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.038372308015823364,
"kl": 0.167083740234375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0181,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.019734080880880356,
"mask/share_reasoning": 0.8474262952804565,
"mask/share_step_conf": 0.113308385014534,
"num_tokens": 39849235.0,
"reward": 0.44620394706726074,
"reward_std": 0.3412773311138153,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/final_brier_reward_step": 0.6088827848434448,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.006962493062019348,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6559932231903076,
"adv/mean_abs_reasoning": 0.5363863706588745,
"adv/mean_abs_step_conf": 0.787875235080719,
"adv/ratio_final_to_reasoning": 1.2229863752587768,
"adv/ratio_step_to_reasoning": 1.46885767084821,
"adv/std_final_conf": 0.8319321274757385,
"adv/std_reasoning": 0.775436282157898,
"adv/std_step_conf": 0.9356468319892883,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6849190938511327,
"calib/avg_num_step_conf": 7.95703125,
"calib/ece": 0.26268774703557296,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6996047430830039,
"calib/gap": 0.21572362459546934,
"calib/mean_conf": 0.810909090909091,
"calib/mu_c": 0.8987333333333334,
"calib/mu_w": 0.683009708737864,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24035573122529633,
"calib/std_conf": 0.30704051379424063,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9063180272108845,
"calib/step_q_c_n": 1176.0,
"calib/step_q_gap": 0.016527086444333983,
"calib/step_q_w": 0.8897909407665505,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1964.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 740.9375,
"completions/mean_terminated_length": 746.7716674804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 310.0,
"epoch": 0.1408,
"grad_norm": 0.019422784447669983,
"kl": 0.172088623046875,
"learning_rate": 1.888888888888889e-06,
"loss": 0.013,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.020528219640254974,
"mask/share_reasoning": 0.8521432876586914,
"mask/share_step_conf": 0.11951600760221481,
"num_tokens": 40144507.0,
"reward": 0.6724216341972351,
"reward_std": 0.36871108412742615,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7125101685523987,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.31748926639556885,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7555452585220337,
"adv/mean_abs_reasoning": 0.5980292558670044,
"adv/mean_abs_step_conf": 0.7885544896125793,
"adv/ratio_final_to_reasoning": 1.263391800835341,
"adv/ratio_step_to_reasoning": 1.3185884835506205,
"adv/std_final_conf": 0.9083207845687866,
"adv/std_reasoning": 0.8267114162445068,
"adv/std_step_conf": 0.9366989135742188,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6685675182481752,
"calib/avg_num_step_conf": 8.1484375,
"calib/ece": 0.3265261044176707,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6345381526104418,
"calib/gap": 0.22614735401459862,
"calib/mean_conf": 0.7602610441767068,
"calib/mu_c": 0.8846875000000001,
"calib/mu_w": 0.6585401459854014,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3184939759036145,
"calib/std_conf": 0.3388102739520392,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9094806629834254,
"calib/step_q_c_n": 905.0,
"calib/step_q_gap": 0.011123338681986006,
"calib/step_q_w": 0.8983573243014394,
"calib/step_q_w_n": 1181.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 835.5390625,
"completions/mean_terminated_length": 845.4466552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.033650532364845276,
"kl": 0.148590087890625,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.031,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018224865198135376,
"mask/share_reasoning": 0.8637900948524475,
"mask/share_step_conf": 0.10626627504825592,
"num_tokens": 40464749.0,
"reward": 0.49485090374946594,
"reward_std": 0.4596877694129944,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6354119777679443,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.07225852459669113,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7146537899971008,
"adv/mean_abs_reasoning": 0.5521727800369263,
"adv/mean_abs_step_conf": 0.7653998136520386,
"adv/ratio_final_to_reasoning": 1.294257550959518,
"adv/ratio_step_to_reasoning": 1.386159987098337,
"adv/std_final_conf": 0.8777416944503784,
"adv/std_reasoning": 0.7928855419158936,
"adv/std_step_conf": 0.9364771246910095,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6981589147286822,
"calib/avg_num_step_conf": 7.3671875,
"calib/ece": 0.24196787148594356,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5662650602409639,
"calib/gap": 0.266874031007752,
"calib/mean_conf": 0.7275100401606427,
"calib/mu_c": 0.8561240310077519,
"calib/mu_w": 0.5892499999999999,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2257028112449797,
"calib/std_conf": 0.3459951959233053,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9080306230200634,
"calib/step_q_c_n": 947.0,
"calib/step_q_gap": 0.013051922274589511,
"calib/step_q_w": 0.8949787007454739,
"calib/step_q_w_n": 939.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 848.98046875,
"completions/mean_terminated_length": 855.6653442382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 386.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.02200193516910076,
"kl": 0.155487060546875,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0334,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.017995350062847137,
"mask/share_reasoning": 0.8758010268211365,
"mask/share_step_conf": 0.09839113801717758,
"num_tokens": 40791040.0,
"reward": 0.5666602849960327,
"reward_std": 0.4101633131504059,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6965745687484741,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.14221477508544922,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7415114045143127,
"adv/mean_abs_reasoning": 0.5965020060539246,
"adv/mean_abs_step_conf": 0.7876029014587402,
"adv/ratio_final_to_reasoning": 1.2430995989765024,
"adv/ratio_step_to_reasoning": 1.3203692417885011,
"adv/std_final_conf": 0.8925070762634277,
"adv/std_reasoning": 0.8268001079559326,
"adv/std_step_conf": 0.936475396156311,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6228057656629085,
"calib/avg_num_step_conf": 8.1484375,
"calib/ece": 0.2920539419087136,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6556016597510373,
"calib/gap": 0.1683380904809476,
"calib/mean_conf": 0.7717219917012449,
"calib/mu_c": 0.8401748251748252,
"calib/mu_w": 0.6718367346938776,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23520746887966795,
"calib/std_conf": 0.3404964058245485,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9003955288048152,
"calib/step_q_c_n": 1163.0,
"calib/step_q_gap": 0.000774727071337411,
"calib/step_q_w": 0.8996208017334778,
"calib/step_q_w_n": 923.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2609.0,
"completions/max_terminated_length": 2609.0,
"completions/mean_length": 756.12109375,
"completions/mean_terminated_length": 786.857666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.144,
"grad_norm": 0.03226546198129654,
"kl": 0.165618896484375,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0553,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018887124955654144,
"mask/share_reasoning": 0.8286978006362915,
"mask/share_step_conf": 0.11335258930921555,
"num_tokens": 41090487.0,
"reward": 0.6198797225952148,
"reward_std": 0.39496392011642456,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6516416668891907,
"rewards/format_reward_step": 0.94140625,
"rewards/step_correlation_reward": 0.28811773657798767,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6814755797386169,
"adv/mean_abs_reasoning": 0.5314260721206665,
"adv/mean_abs_step_conf": 0.7463881969451904,
"adv/ratio_final_to_reasoning": 1.28235255191597,
"adv/ratio_step_to_reasoning": 1.4045005243470898,
"adv/std_final_conf": 0.8753324747085571,
"adv/std_reasoning": 0.7927984595298767,
"adv/std_step_conf": 0.9362938404083252,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6861426100326735,
"calib/avg_num_step_conf": 8.0390625,
"calib/ece": 0.2838400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.596,
"calib/gap": 0.22379012108398988,
"calib/mean_conf": 0.7393600000000001,
"calib/mu_c": 0.8476744186046511,
"calib/mu_w": 0.6238842975206612,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2536000000000001,
"calib/std_conf": 0.352307238642637,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8973889392565729,
"calib/step_q_c_n": 1103.0,
"calib/step_q_gap": 0.023891557057620028,
"calib/step_q_w": 0.8734973821989529,
"calib/step_q_w_n": 955.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 763.12109375,
"completions/mean_terminated_length": 772.1699829101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.021460261195898056,
"kl": 0.167144775390625,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0334,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019838891923427582,
"mask/share_reasoning": 0.8512438535690308,
"mask/share_step_conf": 0.11719851195812225,
"num_tokens": 41394334.0,
"reward": 0.5539348125457764,
"reward_std": 0.3586582541465759,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.671519935131073,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.14103710651397705,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6621338725090027,
"adv/mean_abs_reasoning": 0.46420344710350037,
"adv/mean_abs_step_conf": 0.7904796600341797,
"adv/ratio_final_to_reasoning": 1.4263872374075048,
"adv/ratio_step_to_reasoning": 1.7028733090341135,
"adv/std_final_conf": 0.8399810791015625,
"adv/std_reasoning": 0.7393754720687866,
"adv/std_step_conf": 0.9360789656639099,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7326352415966386,
"calib/avg_num_step_conf": 8.20703125,
"calib/ece": 0.2327016129032258,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": 0.2830672268907565,
"calib/mean_conf": 0.755766129032258,
"calib/mu_c": 0.8836029411764706,
"calib/mu_w": 0.6005357142857142,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22004032258064515,
"calib/std_conf": 0.3423734428027801,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9086728110599079,
"calib/step_q_c_n": 1085.0,
"calib/step_q_gap": 0.03330863783156135,
"calib/step_q_w": 0.8753641732283466,
"calib/step_q_w_n": 1016.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2837.0,
"completions/max_terminated_length": 2837.0,
"completions/mean_length": 759.3359375,
"completions/mean_terminated_length": 777.56005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.023186106234788895,
"kl": 0.162078857421875,
"learning_rate": 1.75e-06,
"loss": -0.0652,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01922602951526642,
"mask/share_reasoning": 0.8434792757034302,
"mask/share_step_conf": 0.1138571947813034,
"num_tokens": 41695708.0,
"reward": 0.6702703237533569,
"reward_std": 0.33385735750198364,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7055327892303467,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.3357889652252197,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.6598755121231079,
"adv/mean_abs_reasoning": 0.5995461344718933,
"adv/mean_abs_step_conf": 0.777741551399231,
"adv/ratio_final_to_reasoning": 1.1006250798436976,
"adv/ratio_step_to_reasoning": 1.2972171892731825,
"adv/std_final_conf": 0.86130690574646,
"adv/std_reasoning": 0.8266723155975342,
"adv/std_step_conf": 0.9359835386276245,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7679413498379016,
"calib/avg_num_step_conf": 7.8515625,
"calib/ece": 0.1534115226337449,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6090534979423868,
"calib/gap": 0.3785459770114941,
"calib/mean_conf": 0.7286378600823046,
"calib/mu_c": 0.8641666666666666,
"calib/mu_w": 0.4856206896551725,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12003703703703711,
"calib/std_conf": 0.3585229766999016,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9093198380566804,
"calib/step_q_c_n": 1235.0,
"calib/step_q_gap": 0.034623063863131964,
"calib/step_q_w": 0.8746967741935484,
"calib/step_q_w_n": 775.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2940.0,
"completions/max_terminated_length": 2940.0,
"completions/mean_length": 779.4453125,
"completions/mean_terminated_length": 801.357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.1472,
"grad_norm": 0.01940912939608097,
"kl": 0.168487548828125,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0544,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.019332313910126686,
"mask/share_reasoning": 0.8415598273277283,
"mask/share_step_conf": 0.1117640882730484,
"num_tokens": 41999582.0,
"reward": 0.7093103528022766,
"reward_std": 0.3921882212162018,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7631781101226807,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.34450504183769226,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.589346170425415,
"adv/mean_abs_reasoning": 0.378071665763855,
"adv/mean_abs_step_conf": 0.7459247708320618,
"adv/ratio_final_to_reasoning": 1.558821312977003,
"adv/ratio_step_to_reasoning": 1.9729718949580561,
"adv/std_final_conf": 0.8030606508255005,
"adv/std_reasoning": 0.6613706946372986,
"adv/std_step_conf": 0.9352872967720032,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.757540684624018,
"calib/avg_num_step_conf": 8.3359375,
"calib/ece": 0.20812000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.664,
"calib/gap": 0.27966750841750854,
"calib/mean_conf": 0.7945200000000001,
"calib/mu_c": 0.8929629629629631,
"calib/mu_w": 0.6132954545454545,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17732000000000006,
"calib/std_conf": 0.3130724670104352,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9137986270022883,
"calib/step_q_c_n": 1311.0,
"calib/step_q_gap": 0.036860595410550756,
"calib/step_q_w": 0.8769380315917376,
"calib/step_q_w_n": 823.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2747.0,
"completions/max_terminated_length": 2747.0,
"completions/mean_length": 711.0703125,
"completions/mean_terminated_length": 719.5020141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.0213282760232687,
"kl": 0.1831817626953125,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0103,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02053791843354702,
"mask/share_reasoning": 0.8478147983551025,
"mask/share_step_conf": 0.11992855370044708,
"num_tokens": 42284712.0,
"reward": 0.7505369782447815,
"reward_std": 0.2918218970298767,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7617222666740417,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.41747671365737915,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.6343496441841125,
"adv/mean_abs_reasoning": 0.48369088768959045,
"adv/mean_abs_step_conf": 0.7830361127853394,
"adv/ratio_final_to_reasoning": 1.3114773511946076,
"adv/ratio_step_to_reasoning": 1.6188771232090158,
"adv/std_final_conf": 0.845893919467926,
"adv/std_reasoning": 0.7394121289253235,
"adv/std_step_conf": 0.9360425472259521,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.710564125831821,
"calib/avg_num_step_conf": 8.2734375,
"calib/ece": 0.20199999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.668,
"calib/gap": 0.27235329703569255,
"calib/mean_conf": 0.7664000000000001,
"calib/mu_c": 0.8491954022988505,
"calib/mu_w": 0.5768421052631579,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13619999999999993,
"calib/std_conf": 0.3427509883282614,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.91,
"calib/step_q_c_n": 1397.0,
"calib/step_q_gap": 0.03300970873786402,
"calib/step_q_w": 0.876990291262136,
"calib/step_q_w_n": 721.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2406.0,
"completions/max_terminated_length": 2406.0,
"completions/mean_length": 735.140625,
"completions/mean_terminated_length": 743.8577270507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.03776135668158531,
"kl": 0.1652679443359375,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0303,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020328957587480545,
"mask/share_reasoning": 0.8499683141708374,
"mask/share_step_conf": 0.11798396706581116,
"num_tokens": 42577924.0,
"reward": 0.7632162570953369,
"reward_std": 0.31106436252593994,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7629226446151733,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.43225979804992676,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.572924017906189,
"adv/mean_abs_reasoning": 0.39419761300086975,
"adv/mean_abs_step_conf": 0.7634836435317993,
"adv/ratio_final_to_reasoning": 1.4533929151542704,
"adv/ratio_step_to_reasoning": 1.9368043294826212,
"adv/std_final_conf": 0.7978944778442383,
"adv/std_reasoning": 0.681702733039856,
"adv/std_step_conf": 0.9354971647262573,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7728465083303793,
"calib/avg_num_step_conf": 7.96875,
"calib/ece": 0.16658536585365846,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6463414634146342,
"calib/gap": 0.35866997518610433,
"calib/mean_conf": 0.7498373983739838,
"calib/mu_c": 0.8825161290322581,
"calib/mu_w": 0.5238461538461537,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.143170731707317,
"calib/std_conf": 0.3539245331800658,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9111320754716982,
"calib/step_q_c_n": 1219.0,
"calib/step_q_gap": 0.08411624112334248,
"calib/step_q_w": 0.8270158343483557,
"calib/step_q_w_n": 821.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2627.0,
"completions/max_terminated_length": 2627.0,
"completions/mean_length": 755.140625,
"completions/mean_terminated_length": 776.3694458007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.1504,
"grad_norm": 0.02357516810297966,
"kl": 0.168670654296875,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0829,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019246093928813934,
"mask/share_reasoning": 0.8440717458724976,
"mask/share_step_conf": 0.1093384325504303,
"num_tokens": 42878336.0,
"reward": 0.7093539237976074,
"reward_std": 0.2913316488265991,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.76347815990448,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.3419484794139862,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.6818134784698486,
"adv/mean_abs_reasoning": 0.5208780169487,
"adv/mean_abs_step_conf": 0.7369670867919922,
"adv/ratio_final_to_reasoning": 1.3089695788351898,
"adv/ratio_step_to_reasoning": 1.4148554225980596,
"adv/std_final_conf": 0.8804184794425964,
"adv/std_reasoning": 0.792790949344635,
"adv/std_step_conf": 0.9357167482376099,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7144522921108741,
"calib/avg_num_step_conf": 8.21875,
"calib/ece": 0.23434959349593493,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5975609756097561,
"calib/gap": 0.29858208955223897,
"calib/mean_conf": 0.7026422764227643,
"calib/mu_c": 0.8385820895522389,
"calib/mu_w": 0.5399999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19613821138211376,
"calib/std_conf": 0.3787011527454767,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9123612417685795,
"calib/step_q_c_n": 1063.0,
"calib/step_q_gap": 0.033677284035630195,
"calib/step_q_w": 0.8786839577329493,
"calib/step_q_w_n": 1041.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2641.0,
"completions/max_terminated_length": 2641.0,
"completions/mean_length": 803.51953125,
"completions/mean_terminated_length": 813.0474853515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.029727516695857048,
"kl": 0.1576995849609375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.034,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01919659972190857,
"mask/share_reasoning": 0.855238139629364,
"mask/share_step_conf": 0.11384646594524384,
"num_tokens": 43189197.0,
"reward": 0.6065089702606201,
"reward_std": 0.327970951795578,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6992902755737305,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.21763396263122559,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6424644589424133,
"adv/mean_abs_reasoning": 0.4570094048976898,
"adv/mean_abs_step_conf": 0.7492420077323914,
"adv/ratio_final_to_reasoning": 1.4058013950199584,
"adv/ratio_step_to_reasoning": 1.6394454899678121,
"adv/std_final_conf": 0.8636579513549805,
"adv/std_reasoning": 0.7393165230751038,
"adv/std_step_conf": 0.9361722469329834,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7209853638425066,
"calib/avg_num_step_conf": 7.8671875,
"calib/ece": 0.2088821138211382,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5934959349593496,
"calib/gap": 0.2949958771387342,
"calib/mean_conf": 0.712540650406504,
"calib/mu_c": 0.8312585034013605,
"calib/mu_w": 0.5362626262626263,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16193089430894309,
"calib/std_conf": 0.3624702796535149,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9037253218884121,
"calib/step_q_c_n": 1165.0,
"calib/step_q_gap": 0.01621059868464292,
"calib/step_q_w": 0.8875147232037692,
"calib/step_q_w_n": 849.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3064.0,
"completions/max_terminated_length": 3064.0,
"completions/mean_length": 779.90625,
"completions/mean_terminated_length": 795.4422607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.02792339399456978,
"kl": 0.1596221923828125,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.054,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0190900769084692,
"mask/share_reasoning": 0.8507193922996521,
"mask/share_step_conf": 0.11065928637981415,
"num_tokens": 43496189.0,
"reward": 0.6489737629890442,
"reward_std": 0.2787196934223175,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7272330522537231,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.26368314027786255,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.6125003099441528,
"adv/mean_abs_reasoning": 0.43078356981277466,
"adv/mean_abs_step_conf": 0.7561758160591125,
"adv/ratio_final_to_reasoning": 1.4218283910186156,
"adv/ratio_step_to_reasoning": 1.7553497139822638,
"adv/std_final_conf": 0.8434417247772217,
"adv/std_reasoning": 0.7205666303634644,
"adv/std_step_conf": 0.9356434345245361,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7194492254733218,
"calib/avg_num_step_conf": 8.11328125,
"calib/ece": 0.19257999999999986,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.572,
"calib/gap": 0.33783921399885264,
"calib/mean_conf": 0.67498,
"calib/mu_c": 0.7884939759036146,
"calib/mu_w": 0.45065476190476195,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10177999999999988,
"calib/std_conf": 0.38638025260098374,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9077133907595929,
"calib/step_q_c_n": 1277.0,
"calib/step_q_gap": 0.0018883907595927951,
"calib/step_q_w": 0.9058250000000001,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 750.02734375,
"completions/mean_terminated_length": 758.9209594726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 367.0,
"epoch": 0.1536,
"grad_norm": 0.03318783640861511,
"kl": 0.18798828125,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0073,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020067322999238968,
"mask/share_reasoning": 0.85340416431427,
"mask/share_step_conf": 0.11480970680713654,
"num_tokens": 43792324.0,
"reward": 0.7210659980773926,
"reward_std": 0.2872735261917114,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7599924802780151,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.3571394085884094,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6912169456481934,
"adv/mean_abs_reasoning": 0.6103887557983398,
"adv/mean_abs_step_conf": 0.7602620124816895,
"adv/ratio_final_to_reasoning": 1.1324208368552542,
"adv/ratio_step_to_reasoning": 1.2455373813158261,
"adv/std_final_conf": 0.8755853772163391,
"adv/std_reasoning": 0.826539158821106,
"adv/std_step_conf": 0.9361149668693542,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6756991837280878,
"calib/avg_num_step_conf": 8.44140625,
"calib/ece": 0.22039525691699607,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5968379446640316,
"calib/gap": 0.25610196708149313,
"calib/mean_conf": 0.7237154150197629,
"calib/mu_c": 0.8188679245283018,
"calib/mu_w": 0.5627659574468087,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15782608695652178,
"calib/std_conf": 0.36024424910245134,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.907763358778626,
"calib/step_q_c_n": 1310.0,
"calib/step_q_gap": 0.021899669001892663,
"calib/step_q_w": 0.8858636897767334,
"calib/step_q_w_n": 851.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2375.0,
"completions/max_terminated_length": 2375.0,
"completions/mean_length": 737.9765625,
"completions/mean_terminated_length": 740.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.03555729612708092,
"kl": 0.178863525390625,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0632,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.020503666251897812,
"mask/share_reasoning": 0.8497222661972046,
"mask/share_step_conf": 0.1258678436279297,
"num_tokens": 44083950.0,
"reward": 0.6904496550559998,
"reward_std": 0.34167811274528503,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7384937405586243,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.3205304741859436,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.6599738597869873,
"adv/mean_abs_reasoning": 0.3847285509109497,
"adv/mean_abs_step_conf": 0.7628105282783508,
"adv/ratio_final_to_reasoning": 1.7154273012083956,
"adv/ratio_step_to_reasoning": 1.9827239919475406,
"adv/std_final_conf": 0.85041743516922,
"adv/std_reasoning": 0.6613991856575012,
"adv/std_step_conf": 0.9356829524040222,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.725688924218336,
"calib/avg_num_step_conf": 8.41015625,
"calib/ece": 0.24115384615384622,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5384615384615384,
"calib/gap": 0.3158220720720719,
"calib/mean_conf": 0.6631781376518217,
"calib/mu_c": 0.8370720720720719,
"calib/mu_w": 0.52125,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22746963562753042,
"calib/std_conf": 0.3803313607572731,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9075369978858352,
"calib/step_q_c_n": 946.0,
"calib/step_q_gap": 0.015275191754932083,
"calib/step_q_w": 0.8922618061309031,
"calib/step_q_w_n": 1207.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 777.671875,
"completions/mean_terminated_length": 793.1633911132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.046256691217422485,
"kl": 0.169403076171875,
"learning_rate": 1.5e-06,
"loss": -0.0331,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019423943012952805,
"mask/share_reasoning": 0.8471709489822388,
"mask/share_step_conf": 0.11387380957603455,
"num_tokens": 44390250.0,
"reward": 0.47675973176956177,
"reward_std": 0.30644434690475464,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6932385563850403,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": -0.019406616687774658,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6734507083892822,
"adv/mean_abs_reasoning": 0.316013365983963,
"adv/mean_abs_step_conf": 0.7818354368209839,
"adv/ratio_final_to_reasoning": 2.1310829885070697,
"adv/ratio_step_to_reasoning": 2.474058128480111,
"adv/std_final_conf": 0.861054003238678,
"adv/std_reasoning": 0.6402580142021179,
"adv/std_step_conf": 0.9357953071594238,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6833056754065715,
"calib/avg_num_step_conf": 8.38671875,
"calib/ece": 0.30921951219512195,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6300813008130082,
"calib/gap": 0.2612223033521406,
"calib/mean_conf": 0.7102764227642276,
"calib/mu_c": 0.8493826086956521,
"calib/mu_w": 0.5881603053435115,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2760081300813008,
"calib/std_conf": 0.3834393095109842,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9026067761806983,
"calib/step_q_c_n": 974.0,
"calib/step_q_gap": -0.0051928828133340765,
"calib/step_q_w": 0.9077996589940324,
"calib/step_q_w_n": 1173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 789.7265625,
"completions/mean_terminated_length": 799.0909423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.1568,
"grad_norm": 0.03732088953256607,
"kl": 0.164581298828125,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0063,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01913021132349968,
"mask/share_reasoning": 0.8517061471939087,
"mask/share_step_conf": 0.11744489520788193,
"num_tokens": 44696100.0,
"reward": 0.4662114679813385,
"reward_std": 0.27114027738571167,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6487675905227661,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.00162409245967865,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.6306105852127075,
"adv/mean_abs_reasoning": 0.5097656846046448,
"adv/mean_abs_step_conf": 0.7392554879188538,
"adv/ratio_final_to_reasoning": 1.2370597006775486,
"adv/ratio_step_to_reasoning": 1.4501868412193981,
"adv/std_final_conf": 0.8591131567955017,
"adv/std_reasoning": 0.7752926349639893,
"adv/std_step_conf": 0.9363844394683838,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7374319361782955,
"calib/avg_num_step_conf": 7.99609375,
"calib/ece": 0.1976470588235294,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6313725490196078,
"calib/gap": 0.3357699126250474,
"calib/mean_conf": 0.7266666666666667,
"calib/mu_c": 0.866241610738255,
"calib/mu_w": 0.5304716981132076,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16999999999999998,
"calib/std_conf": 0.3668021140201801,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9106073211314477,
"calib/step_q_c_n": 1202.0,
"calib/step_q_gap": 0.029175368468725704,
"calib/step_q_w": 0.881431952662722,
"calib/step_q_w_n": 845.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1587.0,
"completions/max_terminated_length": 1587.0,
"completions/mean_length": 711.46484375,
"completions/mean_terminated_length": 714.2549438476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.03166365996003151,
"kl": 0.1800537109375,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0061,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.021450942382216454,
"mask/share_reasoning": 0.8483710289001465,
"mask/share_step_conf": 0.1262717992067337,
"num_tokens": 44983347.0,
"reward": 0.6800941228866577,
"reward_std": 0.3421480059623718,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7624218463897705,
"rewards/format_reward_step": 0.99609375,
"rewards/step_correlation_reward": 0.28214138746261597,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6537437438964844,
"adv/mean_abs_reasoning": 0.5020466446876526,
"adv/mean_abs_step_conf": 0.7768505811691284,
"adv/ratio_final_to_reasoning": 1.3021573808210787,
"adv/ratio_step_to_reasoning": 1.547367340045554,
"adv/std_final_conf": 0.8794245719909668,
"adv/std_reasoning": 0.7576271891593933,
"adv/std_step_conf": 0.9356343746185303,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7993790064102564,
"calib/avg_num_step_conf": 7.71875,
"calib/ece": 0.12939516129032252,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5685483870967742,
"calib/gap": 0.45433226495726514,
"calib/mean_conf": 0.662459677419355,
"calib/mu_c": 0.8529861111111112,
"calib/mu_w": 0.3986538461538461,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10560483870967735,
"calib/std_conf": 0.3929397043788211,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.910442238267148,
"calib/step_q_c_n": 1108.0,
"calib/step_q_gap": 0.051386938727977394,
"calib/step_q_w": 0.8590552995391706,
"calib/step_q_w_n": 868.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2819.0,
"completions/max_terminated_length": 2819.0,
"completions/mean_length": 772.22265625,
"completions/mean_terminated_length": 787.6055908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 264.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.02984035573899746,
"kl": 0.157073974609375,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0238,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.019209206104278564,
"mask/share_reasoning": 0.8516172170639038,
"mask/share_step_conf": 0.10964234173297882,
"num_tokens": 45285492.0,
"reward": 0.6858113408088684,
"reward_std": 0.32667478919029236,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7875117063522339,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.27942347526550293,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.65451580286026,
"adv/mean_abs_reasoning": 0.52001953125,
"adv/mean_abs_step_conf": 0.7654784917831421,
"adv/ratio_final_to_reasoning": 1.2586369617444249,
"adv/ratio_step_to_reasoning": 1.4720187334947183,
"adv/std_final_conf": 0.8596464395523071,
"adv/std_reasoning": 0.7754071354866028,
"adv/std_step_conf": 0.9360762238502502,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7629945181956277,
"calib/avg_num_step_conf": 8.28515625,
"calib/ece": 0.18471999999999988,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.636,
"calib/gap": 0.3906043193976619,
"calib/mean_conf": 0.71152,
"calib/mu_c": 0.8724489795918368,
"calib/mu_w": 0.4818446601941749,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1541199999999999,
"calib/std_conf": 0.37992432088509415,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9113223140495869,
"calib/step_q_c_n": 1210.0,
"calib/step_q_gap": 0.05469223721094785,
"calib/step_q_w": 0.856630076838639,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2380.0,
"completions/max_terminated_length": 2380.0,
"completions/mean_length": 729.2734375,
"completions/mean_terminated_length": 735.0157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.16,
"grad_norm": 0.02691521868109703,
"kl": 0.175537109375,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0028,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.021217215806245804,
"mask/share_reasoning": 0.8411681056022644,
"mask/share_step_conf": 0.12980221211910248,
"num_tokens": 45577146.0,
"reward": 0.6711744666099548,
"reward_std": 0.3526858687400818,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.765114426612854,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.2686406373977661,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.7199978232383728,
"adv/mean_abs_reasoning": 0.550804853439331,
"adv/mean_abs_step_conf": 0.756540060043335,
"adv/ratio_final_to_reasoning": 1.307174072164703,
"adv/ratio_step_to_reasoning": 1.3735174178647007,
"adv/std_final_conf": 0.8927524089813232,
"adv/std_reasoning": 0.7930209040641785,
"adv/std_step_conf": 0.9363460540771484,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7288725258393569,
"calib/avg_num_step_conf": 8.33984375,
"calib/ece": 0.21118852459016396,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.42213114754098363,
"calib/gap": 0.30035195568465867,
"calib/mean_conf": 0.5674180327868852,
"calib/mu_c": 0.7286725663716815,
"calib/mu_w": 0.4283206106870229,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15774590163934427,
"calib/std_conf": 0.39256550753677466,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8955125523012553,
"calib/step_q_c_n": 956.0,
"calib/step_q_gap": 0.04628439284408803,
"calib/step_q_w": 0.8492281594571672,
"calib/step_q_w_n": 1179.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2781.0,
"completions/max_terminated_length": 2781.0,
"completions/mean_length": 780.08203125,
"completions/mean_terminated_length": 798.8040161132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.02839108370244503,
"kl": 0.1691131591796875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0788,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.019112911075353622,
"mask/share_reasoning": 0.8431797027587891,
"mask/share_step_conf": 0.11426986753940582,
"num_tokens": 45883871.0,
"reward": 0.5250996947288513,
"reward_std": 0.3253830671310425,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7012449502944946,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.07004818320274353,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7303016781806946,
"adv/mean_abs_reasoning": 0.5549091696739197,
"adv/mean_abs_step_conf": 0.7628487348556519,
"adv/ratio_final_to_reasoning": 1.3160742660097697,
"adv/ratio_step_to_reasoning": 1.3747272104080084,
"adv/std_final_conf": 0.9063445925712585,
"adv/std_reasoning": 0.8098983764648438,
"adv/std_step_conf": 0.9365310668945312,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7526132404181185,
"calib/avg_num_step_conf": 8.62890625,
"calib/ece": 0.19837349397590362,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.46184738955823296,
"calib/gap": 0.3666753774680604,
"calib/mean_conf": 0.5768473895582329,
"calib/mu_c": 0.7579761904761905,
"calib/mu_w": 0.3913008130081301,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13459839357429718,
"calib/std_conf": 0.4078563324934811,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9114836795252227,
"calib/step_q_c_n": 1011.0,
"calib/step_q_gap": 0.06239352927480535,
"calib/step_q_w": 0.8490901502504173,
"calib/step_q_w_n": 1198.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 772.8046875,
"completions/mean_terminated_length": 781.9683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.035195719450712204,
"kl": 0.16558837890625,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020163387060165405,
"mask/share_reasoning": 0.8471139669418335,
"mask/share_step_conf": 0.12100391089916229,
"num_tokens": 46187101.0,
"reward": 0.6216672658920288,
"reward_std": 0.3602669835090637,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7372526526451111,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.21389436721801758,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7047197222709656,
"adv/mean_abs_reasoning": 0.3971008062362671,
"adv/mean_abs_step_conf": 0.7533491849899292,
"adv/ratio_final_to_reasoning": 1.7746620283910262,
"adv/ratio_step_to_reasoning": 1.8971232824485917,
"adv/std_final_conf": 0.8927444219589233,
"adv/std_reasoning": 0.7014403343200684,
"adv/std_step_conf": 0.9356233477592468,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6776244588744588,
"calib/avg_num_step_conf": 7.734375,
"calib/ece": 0.27427999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.496,
"calib/gap": 0.24861471861471868,
"calib/mean_conf": 0.60148,
"calib/mu_c": 0.696948051948052,
"calib/mu_w": 0.4483333333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12987999999999997,
"calib/std_conf": 0.4081673793923272,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9095986895986897,
"calib/step_q_c_n": 1221.0,
"calib/step_q_gap": 0.035152049282484166,
"calib/step_q_w": 0.8744466403162056,
"calib/step_q_w_n": 759.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2600.0,
"completions/max_terminated_length": 2600.0,
"completions/mean_length": 777.4921875,
"completions/mean_terminated_length": 783.6141967773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 324.0,
"epoch": 0.1632,
"grad_norm": 0.04205191507935524,
"kl": 0.1555328369140625,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0431,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019509706646203995,
"mask/share_reasoning": 0.8606699705123901,
"mask/share_step_conf": 0.11200778931379318,
"num_tokens": 46493459.0,
"reward": 0.6556740999221802,
"reward_std": 0.2867233157157898,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6975207328796387,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.2982025146484375,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6398210525512695,
"adv/mean_abs_reasoning": 0.4637221693992615,
"adv/mean_abs_step_conf": 0.7866086959838867,
"adv/ratio_final_to_reasoning": 1.3797508395601163,
"adv/ratio_step_to_reasoning": 1.6962930562558942,
"adv/std_final_conf": 0.8462924957275391,
"adv/std_reasoning": 0.739355206489563,
"adv/std_step_conf": 0.9352414011955261,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7172987616099071,
"calib/avg_num_step_conf": 8.08984375,
"calib/ece": 0.21296,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.35551212590299275,
"calib/mean_conf": 0.59296,
"calib/mu_c": 0.7550735294117646,
"calib/mu_w": 0.3995614035087719,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13096000000000002,
"calib/std_conf": 0.41283318471266334,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9078578199052132,
"calib/step_q_c_n": 1055.0,
"calib/step_q_gap": 0.03562356793670918,
"calib/step_q_w": 0.872234251968504,
"calib/step_q_w_n": 1016.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 761.72265625,
"completions/mean_terminated_length": 770.7549438476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.030612580478191376,
"kl": 0.184112548828125,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0058,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019752800464630127,
"mask/share_reasoning": 0.8524797558784485,
"mask/share_step_conf": 0.11604867875576019,
"num_tokens": 46792900.0,
"reward": 0.6149193048477173,
"reward_std": 0.31094467639923096,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7377804517745972,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.190495565533638,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.7606242895126343,
"adv/mean_abs_reasoning": 0.5110421776771545,
"adv/mean_abs_step_conf": 0.7742916941642761,
"adv/ratio_final_to_reasoning": 1.4883786950226066,
"adv/ratio_step_to_reasoning": 1.5151228763224054,
"adv/std_final_conf": 0.9109428524971008,
"adv/std_reasoning": 0.775321364402771,
"adv/std_step_conf": 0.9362097382545471,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6389382778765558,
"calib/avg_num_step_conf": 8.36328125,
"calib/ece": 0.2739840637450198,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.44223107569721115,
"calib/gap": 0.22977647955295888,
"calib/mean_conf": 0.5523904382470118,
"calib/mu_c": 0.6659055118110234,
"calib/mu_w": 0.43612903225806454,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16019920318725092,
"calib/std_conf": 0.4111017445483834,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.893562271062271,
"calib/step_q_c_n": 1092.0,
"calib/step_q_gap": 0.007642347325378718,
"calib/step_q_w": 0.8859199237368923,
"calib/step_q_w_n": 1049.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 739.1953125,
"completions/mean_terminated_length": 742.0941772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.04050329327583313,
"kl": 0.18646240234375,
"learning_rate": 1.25e-06,
"loss": 0.0259,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020515194162726402,
"mask/share_reasoning": 0.8497653007507324,
"mask/share_step_conf": 0.125813290476799,
"num_tokens": 47089350.0,
"reward": 0.5554911494255066,
"reward_std": 0.3071201741695404,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6801988482475281,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.1354709416627884,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6814758777618408,
"adv/mean_abs_reasoning": 0.46522659063339233,
"adv/mean_abs_step_conf": 0.7735946178436279,
"adv/ratio_final_to_reasoning": 1.4648257246732854,
"adv/ratio_step_to_reasoning": 1.66283405424097,
"adv/std_final_conf": 0.8751667737960815,
"adv/std_reasoning": 0.7393137812614441,
"adv/std_step_conf": 0.9359323978424072,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5959168472372698,
"calib/avg_num_step_conf": 8.296875,
"calib/ece": 0.3109756097560975,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4715447154471545,
"calib/gap": 0.15321641386782242,
"calib/mean_conf": 0.5764227642276422,
"calib/mu_c": 0.6411971830985916,
"calib/mu_w": 0.4879807692307692,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15508130081300808,
"calib/std_conf": 0.4108352192399877,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.898228176318064,
"calib/step_q_c_n": 1157.0,
"calib/step_q_gap": 0.0323853634948994,
"calib/step_q_w": 0.8658428128231646,
"calib/step_q_w_n": 967.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 760.60546875,
"completions/mean_terminated_length": 766.594482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 252.0,
"epoch": 0.1664,
"grad_norm": 0.031293194741010666,
"kl": 0.1655731201171875,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019975952804088593,
"mask/share_reasoning": 0.848551332950592,
"mask/share_step_conf": 0.12366022914648056,
"num_tokens": 47388825.0,
"reward": 0.6052007675170898,
"reward_std": 0.2788010835647583,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.634109377861023,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.27394840121269226,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6583760976791382,
"adv/mean_abs_reasoning": 0.4844781756401062,
"adv/mean_abs_step_conf": 0.7745144367218018,
"adv/ratio_final_to_reasoning": 1.3589386081411678,
"adv/ratio_step_to_reasoning": 1.5986570204085901,
"adv/std_final_conf": 0.8895961046218872,
"adv/std_reasoning": 0.7755122184753418,
"adv/std_step_conf": 0.9363909363746643,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7374674702095604,
"calib/avg_num_step_conf": 8.95703125,
"calib/ece": 0.20248987854251008,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5587044534412956,
"calib/gap": 0.37048691960005486,
"calib/mean_conf": 0.6547165991902834,
"calib/mu_c": 0.8017114093959732,
"calib/mu_w": 0.4312244897959183,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12698380566801615,
"calib/std_conf": 0.3979885161097442,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8932716807367613,
"calib/step_q_c_n": 1303.0,
"calib/step_q_gap": 0.04945349891857942,
"calib/step_q_w": 0.8438181818181819,
"calib/step_q_w_n": 990.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1865.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 729.78125,
"completions/mean_terminated_length": 753.3225708007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.02866934984922409,
"kl": 0.171966552734375,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.1051,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019679686054587364,
"mask/share_reasoning": 0.8270054459571838,
"mask/share_step_conf": 0.12206491827964783,
"num_tokens": 47679377.0,
"reward": 0.6636831164360046,
"reward_std": 0.33132559061050415,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7471444606781006,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.27162790298461914,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.6822168231010437,
"adv/mean_abs_reasoning": 0.5764622688293457,
"adv/mean_abs_step_conf": 0.7968577742576599,
"adv/ratio_final_to_reasoning": 1.1834544253632762,
"adv/ratio_step_to_reasoning": 1.3823242514655534,
"adv/std_final_conf": 0.8623555898666382,
"adv/std_reasoning": 0.7929839491844177,
"adv/std_step_conf": 0.936363160610199,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6801173139158576,
"calib/avg_num_step_conf": 8.29296875,
"calib/ece": 0.25137651821862356,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.708502024291498,
"calib/gap": 0.284259708737864,
"calib/mean_conf": 0.7627125506072874,
"calib/mu_c": 0.8812500000000001,
"calib/mu_w": 0.5969902912621361,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2155465587044535,
"calib/std_conf": 0.36784789691879916,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9059918032786884,
"calib/step_q_c_n": 1220.0,
"calib/step_q_gap": 0.04028859176152344,
"calib/step_q_w": 0.865703211517165,
"calib/step_q_w_n": 903.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2871.0,
"completions/max_terminated_length": 2871.0,
"completions/mean_length": 737.34765625,
"completions/mean_terminated_length": 749.0516357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 330.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0269920751452446,
"kl": 0.1685638427734375,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0251,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020759008824825287,
"mask/share_reasoning": 0.8345186710357666,
"mask/share_step_conf": 0.12909731268882751,
"num_tokens": 47973378.0,
"reward": 0.6014918684959412,
"reward_std": 0.4104700982570648,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.701915979385376,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.19559898972511292,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.6884654760360718,
"adv/mean_abs_reasoning": 0.526401162147522,
"adv/mean_abs_step_conf": 0.7543092370033264,
"adv/ratio_final_to_reasoning": 1.3078722570204582,
"adv/ratio_step_to_reasoning": 1.43295511340823,
"adv/std_final_conf": 0.8764867186546326,
"adv/std_reasoning": 0.8098902106285095,
"adv/std_step_conf": 0.9364614486694336,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6864791288566244,
"calib/avg_num_step_conf": 8.44921875,
"calib/ece": 0.27042168674698797,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5863453815261044,
"calib/gap": 0.28616800622245275,
"calib/mean_conf": 0.653714859437751,
"calib/mu_c": 0.78703007518797,
"calib/mu_w": 0.5008620689655172,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19500000000000003,
"calib/std_conf": 0.4122793373772121,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9098790322580644,
"calib/step_q_c_n": 1116.0,
"calib/step_q_gap": 0.029883807807252527,
"calib/step_q_w": 0.8799952244508119,
"calib/step_q_w_n": 1047.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 741.390625,
"completions/mean_terminated_length": 756.1593627929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 356.0,
"epoch": 0.1696,
"grad_norm": 0.03104994259774685,
"kl": 0.17242431640625,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0501,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020054301247000694,
"mask/share_reasoning": 0.8368874192237854,
"mask/share_step_conf": 0.12352701276540756,
"num_tokens": 48267958.0,
"reward": 0.6443827152252197,
"reward_std": 0.3411294221878052,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6898362636566162,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.301272988319397,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.7009190320968628,
"adv/mean_abs_reasoning": 0.5025119185447693,
"adv/mean_abs_step_conf": 0.7787777781486511,
"adv/ratio_final_to_reasoning": 1.3948306621794428,
"adv/ratio_step_to_reasoning": 1.5497697654692921,
"adv/std_final_conf": 0.8760209679603577,
"adv/std_reasoning": 0.7579127550125122,
"adv/std_step_conf": 0.9360190033912659,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.6175496688741722,
"calib/avg_num_step_conf": 9.06640625,
"calib/ece": 0.29861924686192465,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5815899581589958,
"calib/gap": 0.18110776640577975,
"calib/mean_conf": 0.6735146443514644,
"calib/mu_c": 0.7401986754966888,
"calib/mu_w": 0.5590909090909091,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17016736401673635,
"calib/std_conf": 0.39207445123159396,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.893084935897436,
"calib/step_q_c_n": 1248.0,
"calib/step_q_gap": 0.045927433567519826,
"calib/step_q_w": 0.8471575023299162,
"calib/step_q_w_n": 1073.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2582.0,
"completions/max_terminated_length": 2582.0,
"completions/mean_length": 707.03125,
"completions/mean_terminated_length": 744.85595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.04126443713903427,
"kl": 0.1840667724609375,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0968,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.019329741597175598,
"mask/share_reasoning": 0.8126784563064575,
"mask/share_step_conf": 0.11721055209636688,
"num_tokens": 48553798.0,
"reward": 0.6182776689529419,
"reward_std": 0.3836175203323364,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6499402523040771,
"rewards/format_reward_step": 0.93359375,
"rewards/step_correlation_reward": 0.28192758560180664,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.572830855846405,
"adv/mean_abs_reasoning": 0.3294076919555664,
"adv/mean_abs_step_conf": 0.7416273355484009,
"adv/ratio_final_to_reasoning": 1.7389723125338368,
"adv/ratio_step_to_reasoning": 2.251396532806036,
"adv/std_final_conf": 0.8279606103897095,
"adv/std_reasoning": 0.6402148008346558,
"adv/std_step_conf": 0.9353938102722168,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6893785883147585,
"calib/avg_num_step_conf": 8.98046875,
"calib/ece": 0.22601593625497995,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6334661354581673,
"calib/gap": 0.27145558932792946,
"calib/mean_conf": 0.7069721115537849,
"calib/mu_c": 0.7751063829787233,
"calib/mu_w": 0.5036507936507938,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09199203187250984,
"calib/std_conf": 0.3878477686573863,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9108514605344934,
"calib/step_q_c_n": 1609.0,
"calib/step_q_gap": 0.025170301114203375,
"calib/step_q_w": 0.88568115942029,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 729.03515625,
"completions/mean_terminated_length": 734.7755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.04238247498869896,
"kl": 0.174163818359375,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.028,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.021196896210312843,
"mask/share_reasoning": 0.8361594080924988,
"mask/share_step_conf": 0.1348312497138977,
"num_tokens": 48844351.0,
"reward": 0.7657527327537537,
"reward_std": 0.283169686794281,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7430897951126099,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.4462280869483948,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5648688673973083,
"adv/mean_abs_reasoning": 0.4343217611312866,
"adv/mean_abs_step_conf": 0.7546055316925049,
"adv/ratio_final_to_reasoning": 1.3005769407592727,
"adv/ratio_step_to_reasoning": 1.7374343153494514,
"adv/std_final_conf": 0.8103326559066772,
"adv/std_reasoning": 0.7014553546905518,
"adv/std_step_conf": 0.9359780550003052,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7571895424836601,
"calib/avg_num_step_conf": 8.17578125,
"calib/ece": 0.18587250996015942,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6414342629482072,
"calib/gap": 0.37191169208424096,
"calib/mean_conf": 0.7117689243027888,
"calib/mu_c": 0.8317882352941176,
"calib/mu_w": 0.45987654320987664,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11017529880478091,
"calib/std_conf": 0.38622836894578855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9041127063890884,
"calib/step_q_c_n": 1393.0,
"calib/step_q_gap": 0.03745556353194557,
"calib/step_q_w": 0.8666571428571428,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2009.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 713.2265625,
"completions/mean_terminated_length": 718.842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.1728,
"grad_norm": 0.022894565016031265,
"kl": 0.178131103515625,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0356,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.021328045055270195,
"mask/share_reasoning": 0.8429559469223022,
"mask/share_step_conf": 0.127903550863266,
"num_tokens": 49131081.0,
"reward": 0.7984813451766968,
"reward_std": 0.2730226218700409,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.778145968914032,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.48991042375564575,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.6256484985351562,
"adv/mean_abs_reasoning": 0.48577702045440674,
"adv/mean_abs_step_conf": 0.774186372756958,
"adv/ratio_final_to_reasoning": 1.287933500744664,
"adv/ratio_step_to_reasoning": 1.5937072775339738,
"adv/std_final_conf": 0.8259115815162659,
"adv/std_reasoning": 0.7394855618476868,
"adv/std_step_conf": 0.9359018206596375,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7147538697531725,
"calib/avg_num_step_conf": 8.875,
"calib/ece": 0.22711522633744846,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5679012345679012,
"calib/gap": 0.30850432296750807,
"calib/mean_conf": 0.6559218106995885,
"calib/mu_c": 0.7841478873239438,
"calib/mu_w": 0.47564356435643573,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1493374485596707,
"calib/std_conf": 0.4039821305301361,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8987555953446733,
"calib/step_q_c_n": 1117.0,
"calib/step_q_gap": 0.12455646114553909,
"calib/step_q_w": 0.7741991341991342,
"calib/step_q_w_n": 1155.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 771.44921875,
"completions/mean_terminated_length": 789.9640502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 288.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.03064306080341339,
"kl": 0.16400146484375,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0444,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01990315318107605,
"mask/share_reasoning": 0.8389712572097778,
"mask/share_step_conf": 0.11768805980682373,
"num_tokens": 49433404.0,
"reward": 0.6272368431091309,
"reward_std": 0.3007856011390686,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7011457085609436,
"rewards/format_reward_step": 0.94921875,
"rewards/step_correlation_reward": 0.2525468170642853,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.648464024066925,
"adv/mean_abs_reasoning": 0.4645434021949768,
"adv/mean_abs_step_conf": 0.74281245470047,
"adv/ratio_final_to_reasoning": 1.3959169821440143,
"adv/ratio_step_to_reasoning": 1.5990162624001683,
"adv/std_final_conf": 0.8585214018821716,
"adv/std_reasoning": 0.7393715381622314,
"adv/std_step_conf": 0.9360941052436829,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7796841684434969,
"calib/avg_num_step_conf": 8.609375,
"calib/ece": 0.15414634146341474,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5203252032520326,
"calib/gap": 0.43533448827292126,
"calib/mean_conf": 0.6191869918699187,
"calib/mu_c": 0.8173880597014928,
"calib/mu_w": 0.3820535714285715,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.114308943089431,
"calib/std_conf": 0.4069240026046992,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8863724624889673,
"calib/step_q_c_n": 1133.0,
"calib/step_q_gap": 0.0132912299959701,
"calib/step_q_w": 0.8730812324929972,
"calib/step_q_w_n": 1071.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 803.91015625,
"completions/mean_terminated_length": 823.2040405273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.0358455553650856,
"kl": 0.165863037109375,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0161,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01811538077890873,
"mask/share_reasoning": 0.8500458002090454,
"mask/share_step_conf": 0.10840128362178802,
"num_tokens": 49745341.0,
"reward": 0.6460665464401245,
"reward_std": 0.3104490339756012,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7656679749488831,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.22959008812904358,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5592621564865112,
"adv/mean_abs_reasoning": 0.4206353425979614,
"adv/mean_abs_step_conf": 0.7922811508178711,
"adv/ratio_final_to_reasoning": 1.329565302412184,
"adv/ratio_step_to_reasoning": 1.883534431330761,
"adv/std_final_conf": 0.8088847398757935,
"adv/std_reasoning": 0.6817131042480469,
"adv/std_step_conf": 0.9359111189842224,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.760966306420852,
"calib/avg_num_step_conf": 8.69140625,
"calib/ece": 0.2168525896414342,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5976095617529881,
"calib/gap": 0.3785435473617293,
"calib/mean_conf": 0.6782071713147411,
"calib/mu_c": 0.8606923076923078,
"calib/mu_w": 0.48214876033057846,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18856573705179278,
"calib/std_conf": 0.3996058477726149,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9013636363636365,
"calib/step_q_c_n": 1078.0,
"calib/step_q_gap": 0.020980026947769015,
"calib/step_q_w": 0.8803836094158675,
"calib/step_q_w_n": 1147.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1935.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 773.25,
"completions/mean_terminated_length": 782.4190063476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.176,
"grad_norm": 0.027194950729608536,
"kl": 0.169952392578125,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0021,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019431117922067642,
"mask/share_reasoning": 0.8503310680389404,
"mask/share_step_conf": 0.11851904541254044,
"num_tokens": 50048869.0,
"reward": 0.5861892700195312,
"reward_std": 0.3149217367172241,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7353046536445618,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.14098012447357178,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.6314215064048767,
"adv/mean_abs_reasoning": 0.4578407108783722,
"adv/mean_abs_step_conf": 0.7630418539047241,
"adv/ratio_final_to_reasoning": 1.3791292285770917,
"adv/ratio_step_to_reasoning": 1.6666098836881944,
"adv/std_final_conf": 0.8481122851371765,
"adv/std_reasoning": 0.7393897771835327,
"adv/std_step_conf": 0.9354910850524902,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7865882352941176,
"calib/avg_num_step_conf": 8.30859375,
"calib/ece": 0.1518734693877551,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6285714285714286,
"calib/gap": 0.46196274509803925,
"calib/mean_conf": 0.6988122448979591,
"calib/mu_c": 0.8402294117647059,
"calib/mu_w": 0.37826666666666664,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07840408163265303,
"calib/std_conf": 0.39420472958813463,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8941734906315059,
"calib/step_q_c_n": 1441.0,
"calib/step_q_gap": 0.03326970054404221,
"calib/step_q_w": 0.8609037900874636,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 772.12109375,
"completions/mean_terminated_length": 787.5020141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.026412660256028175,
"kl": 0.166351318359375,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0237,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019667625427246094,
"mask/share_reasoning": 0.838019847869873,
"mask/share_step_conf": 0.12278129160404205,
"num_tokens": 50352716.0,
"reward": 0.7692021131515503,
"reward_std": 0.28766733407974243,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7928230166435242,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.42136237025260925,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.5876627564430237,
"adv/mean_abs_reasoning": 0.46079394221305847,
"adv/mean_abs_step_conf": 0.7655038833618164,
"adv/ratio_final_to_reasoning": 1.2753265670565272,
"adv/ratio_step_to_reasoning": 1.661271586352297,
"adv/std_final_conf": 0.8277955651283264,
"adv/std_reasoning": 0.7207027673721313,
"adv/std_step_conf": 0.9362152218818665,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6482921318447634,
"calib/avg_num_step_conf": 8.40625,
"calib/ece": 0.27191235059760954,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7250996015936255,
"calib/gap": 0.24071703880914408,
"calib/mean_conf": 0.7774900398406376,
"calib/mu_c": 0.8724342105263159,
"calib/mu_w": 0.6317171717171718,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22191235059760955,
"calib/std_conf": 0.36075572915192167,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8928288973384032,
"calib/step_q_c_n": 1315.0,
"calib/step_q_gap": -0.00788794853973307,
"calib/step_q_w": 0.9007168458781363,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 762.58203125,
"completions/mean_terminated_length": 765.5725708007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.0374019555747509,
"kl": 0.1687774658203125,
"learning_rate": 9.166666666666666e-07,
"loss": 0.03,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019959034398198128,
"mask/share_reasoning": 0.8518633842468262,
"mask/share_step_conf": 0.12427132576704025,
"num_tokens": 50653545.0,
"reward": 0.6505559682846069,
"reward_std": 0.3292909264564514,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7024472951889038,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.28382089734077454,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.6799878478050232,
"adv/mean_abs_reasoning": 0.5563756227493286,
"adv/mean_abs_step_conf": 0.7683614492416382,
"adv/ratio_final_to_reasoning": 1.2221740493317539,
"adv/ratio_step_to_reasoning": 1.3810120677911484,
"adv/std_final_conf": 0.8610654473304749,
"adv/std_reasoning": 0.7931175827980042,
"adv/std_step_conf": 0.9361603260040283,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7151613357756799,
"calib/avg_num_step_conf": 9.99609375,
"calib/ece": 0.20934693877551025,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6244897959183674,
"calib/gap": 0.34080597435536125,
"calib/mean_conf": 0.7038775510204082,
"calib/mu_c": 0.834635761589404,
"calib/mu_w": 0.49382978723404275,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14844897959183678,
"calib/std_conf": 0.3908964909524009,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8802591973244147,
"calib/step_q_c_n": 1196.0,
"calib/step_q_gap": 0.03288575638530966,
"calib/step_q_w": 0.847373440939105,
"calib/step_q_w_n": 1363.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 815.10546875,
"completions/mean_terminated_length": 831.3426513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 354.0,
"epoch": 0.1792,
"grad_norm": 0.025498026981949806,
"kl": 0.1545257568359375,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0058,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018487967550754547,
"mask/share_reasoning": 0.8505588173866272,
"mask/share_step_conf": 0.11142198741436005,
"num_tokens": 50966884.0,
"reward": 0.6711308360099792,
"reward_std": 0.3440842032432556,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7275401949882507,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.306127667427063,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.598463237285614,
"adv/mean_abs_reasoning": 0.4369748830795288,
"adv/mean_abs_step_conf": 0.731543779373169,
"adv/ratio_final_to_reasoning": 1.369559808719474,
"adv/ratio_step_to_reasoning": 1.6741094458740984,
"adv/std_final_conf": 0.8424903154373169,
"adv/std_reasoning": 0.7204640507698059,
"adv/std_step_conf": 0.9356883764266968,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6809899198759369,
"calib/avg_num_step_conf": 8.72265625,
"calib/ece": 0.2695238095238095,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6626984126984127,
"calib/gap": 0.25095115016800207,
"calib/mean_conf": 0.744920634920635,
"calib/mu_c": 0.8504794520547946,
"calib/mu_w": 0.5995283018867925,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21753968253968256,
"calib/std_conf": 0.36775972580396354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8915513126491645,
"calib/step_q_c_n": 1257.0,
"calib/step_q_gap": -0.011676146367228801,
"calib/step_q_w": 0.9032274590163933,
"calib/step_q_w_n": 976.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2236.0,
"completions/max_terminated_length": 2236.0,
"completions/mean_length": 759.2265625,
"completions/mean_terminated_length": 768.2293090820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 377.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.12461809068918228,
"kl": 0.1781768798828125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0252,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0193403922021389,
"mask/share_reasoning": 0.8507750034332275,
"mask/share_step_conf": 0.11816590279340744,
"num_tokens": 51265430.0,
"reward": 0.6641005277633667,
"reward_std": 0.30344459414482117,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7047703266143799,
"rewards/format_reward_step": 0.984375,
"rewards/step_correlation_reward": 0.31171196699142456,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5729464292526245,
"adv/mean_abs_reasoning": 0.5114200711250305,
"adv/mean_abs_step_conf": 0.7788619995117188,
"adv/ratio_final_to_reasoning": 1.1203049344392122,
"adv/ratio_step_to_reasoning": 1.5229398365190576,
"adv/std_final_conf": 0.7941260933876038,
"adv/std_reasoning": 0.7753347158432007,
"adv/std_step_conf": 0.9359708428382874,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7183445945945945,
"calib/avg_num_step_conf": 9.09375,
"calib/ece": 0.22644354838709668,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6854838709677419,
"calib/gap": 0.3284681081081082,
"calib/mean_conf": 0.7474112903225807,
"calib/mu_c": 0.8798581081081083,
"calib/mu_w": 0.55139,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18854032258064507,
"calib/std_conf": 0.37516738225922347,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8987419871794873,
"calib/step_q_c_n": 1248.0,
"calib/step_q_gap": 0.0009179131054131151,
"calib/step_q_w": 0.8978240740740742,
"calib/step_q_w_n": 1080.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2992.0,
"completions/max_terminated_length": 2992.0,
"completions/mean_length": 784.21484375,
"completions/mean_terminated_length": 793.5138549804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.02952180802822113,
"kl": 0.159332275390625,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0433,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018694642931222916,
"mask/share_reasoning": 0.8499318361282349,
"mask/share_step_conf": 0.1196548119187355,
"num_tokens": 51570341.0,
"reward": 0.676643431186676,
"reward_std": 0.3154790997505188,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7304421663284302,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.31346967816352844,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.6785448789596558,
"adv/mean_abs_reasoning": 0.5094852447509766,
"adv/mean_abs_step_conf": 0.7583144307136536,
"adv/ratio_final_to_reasoning": 1.331824397174272,
"adv/ratio_step_to_reasoning": 1.488393311732312,
"adv/std_final_conf": 0.8765658736228943,
"adv/std_reasoning": 0.7927893400192261,
"adv/std_step_conf": 0.9364736676216125,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6562177169421488,
"calib/avg_num_step_conf": 8.03125,
"calib/ece": 0.32253012048192764,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6345381526104418,
"calib/gap": 0.2068220557851238,
"calib/mean_conf": 0.711285140562249,
"calib/mu_c": 0.8176033057851239,
"calib/mu_w": 0.61078125,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2739357429718875,
"calib/std_conf": 0.3855314422871999,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.893192118226601,
"calib/step_q_c_n": 1015.0,
"calib/step_q_gap": 0.04358597029192279,
"calib/step_q_w": 0.8496061479346783,
"calib/step_q_w_n": 1041.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 748.3671875,
"completions/mean_terminated_length": 757.2411499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.1824,
"grad_norm": 0.047669488936662674,
"kl": 0.16168212890625,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0245,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020132333040237427,
"mask/share_reasoning": 0.8459810018539429,
"mask/share_step_conf": 0.12216789275407791,
"num_tokens": 51868819.0,
"reward": 0.5441136360168457,
"reward_std": 0.3625720143318176,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6362277269363403,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.1629369854927063,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.60759437084198,
"adv/mean_abs_reasoning": 0.531531572341919,
"adv/mean_abs_step_conf": 0.7559362649917603,
"adv/ratio_final_to_reasoning": 1.1431011861909344,
"adv/ratio_step_to_reasoning": 1.4221850673161673,
"adv/std_final_conf": 0.8296145796775818,
"adv/std_reasoning": 0.7928065061569214,
"adv/std_step_conf": 0.9353254437446594,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6312409812409812,
"calib/avg_num_step_conf": 8.15625,
"calib/ece": 0.28329317269076304,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7751004016064257,
"calib/gap": 0.14640909090909093,
"calib/mean_conf": 0.8395180722891568,
"calib/mu_c": 0.888909090909091,
"calib/mu_w": 0.7425,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23008032128514053,
"calib/std_conf": 0.3036547273606863,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8989647577092512,
"calib/step_q_c_n": 1362.0,
"calib/step_q_gap": 0.014942719141758043,
"calib/step_q_w": 0.8840220385674932,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 740.01171875,
"completions/mean_terminated_length": 748.78662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.036135055124759674,
"kl": 0.1663055419921875,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0072,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02015288546681404,
"mask/share_reasoning": 0.8440628051757812,
"mask/share_step_conf": 0.12406553328037262,
"num_tokens": 52161614.0,
"reward": 0.7183195352554321,
"reward_std": 0.36564695835113525,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6987804174423218,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.4144209623336792,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.6681346893310547,
"adv/mean_abs_reasoning": 0.5042810440063477,
"adv/mean_abs_step_conf": 0.7563927173614502,
"adv/ratio_final_to_reasoning": 1.3249252520438672,
"adv/ratio_step_to_reasoning": 1.4999427925193418,
"adv/std_final_conf": 0.8785695433616638,
"adv/std_reasoning": 0.7753050923347473,
"adv/std_step_conf": 0.936163604259491,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6136217605482972,
"calib/avg_num_step_conf": 9.46875,
"calib/ece": 0.26256854838709687,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7782258064516129,
"calib/gap": 0.21532512315270913,
"calib/mean_conf": 0.822891129032258,
"calib/mu_c": 0.8984285714285714,
"calib/mu_w": 0.6831034482758622,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2181330645161291,
"calib/std_conf": 0.3315241565450681,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8894685314685316,
"calib/step_q_c_n": 1430.0,
"calib/step_q_gap": -0.00946507014112652,
"calib/step_q_w": 0.8989336016096581,
"calib/step_q_w_n": 994.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2563.0,
"completions/max_terminated_length": 2563.0,
"completions/mean_length": 767.86328125,
"completions/mean_terminated_length": 773.909423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.03230111673474312,
"kl": 0.1728515625,
"learning_rate": 7.5e-07,
"loss": -0.0651,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01997010037302971,
"mask/share_reasoning": 0.8442978858947754,
"mask/share_step_conf": 0.1279194951057434,
"num_tokens": 52461347.0,
"reward": 0.6749608516693115,
"reward_std": 0.3358026146888733,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7034523487091064,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.32850050926208496,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7172695398330688,
"adv/mean_abs_reasoning": 0.5680016279220581,
"adv/mean_abs_step_conf": 0.8021489381790161,
"adv/ratio_final_to_reasoning": 1.262794866375794,
"adv/ratio_step_to_reasoning": 1.412229998553962,
"adv/std_final_conf": 0.8888024091720581,
"adv/std_reasoning": 0.793002188205719,
"adv/std_step_conf": 0.9364355802536011,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6110557106767718,
"calib/avg_num_step_conf": 8.75390625,
"calib/ece": 0.3080566801619432,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6720647773279352,
"calib/gap": 0.1802566148118605,
"calib/mean_conf": 0.7419028340080971,
"calib/mu_c": 0.8214492753623189,
"calib/mu_w": 0.6411926605504584,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24562753036437238,
"calib/std_conf": 0.3722543516195086,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8913099041533546,
"calib/step_q_c_n": 1252.0,
"calib/step_q_gap": 0.01091556643849112,
"calib/step_q_w": 0.8803943377148635,
"calib/step_q_w_n": 989.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2730.0,
"completions/max_terminated_length": 2730.0,
"completions/mean_length": 818.765625,
"completions/mean_terminated_length": 831.761962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.1856,
"grad_norm": 0.02354642190039158,
"kl": 0.1643524169921875,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0409,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0181894451379776,
"mask/share_reasoning": 0.848423182964325,
"mask/share_step_conf": 0.11776234954595566,
"num_tokens": 52775183.0,
"reward": 0.5617752075195312,
"reward_std": 0.4311276376247406,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6466355323791504,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.17613360285758972,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6367019414901733,
"adv/mean_abs_reasoning": 0.5002825856208801,
"adv/mean_abs_step_conf": 0.7642124891281128,
"adv/ratio_final_to_reasoning": 1.272684598245587,
"adv/ratio_step_to_reasoning": 1.5275616443448259,
"adv/std_final_conf": 0.8446736335754395,
"adv/std_reasoning": 0.7754291892051697,
"adv/std_step_conf": 0.9358320236206055,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6698145604395604,
"calib/avg_num_step_conf": 8.6171875,
"calib/ece": 0.3659795081967213,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6434426229508197,
"calib/gap": 0.22436950549450552,
"calib/mean_conf": 0.736061475409836,
"calib/mu_c": 0.864798076923077,
"calib/mu_w": 0.6404285714285715,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.337905737704918,
"calib/std_conf": 0.37361490692930116,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8871815144766148,
"calib/step_q_c_n": 898.0,
"calib/step_q_gap": 0.05556071936958107,
"calib/step_q_w": 0.8316207951070337,
"calib/step_q_w_n": 1308.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 799.3125,
"completions/mean_terminated_length": 818.4960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.0368288978934288,
"kl": 0.1604461669921875,
"learning_rate": 6.944444444444446e-07,
"loss": -0.045,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.019019050523638725,
"mask/share_reasoning": 0.8398895263671875,
"mask/share_step_conf": 0.11765392124652863,
"num_tokens": 53085631.0,
"reward": 0.4227388799190521,
"reward_std": 0.3410660922527313,
"rewards/accuracy_reward_step": 0.40625,
"rewards/final_brier_reward_step": 0.5925886631011963,
"rewards/format_reward_step": 0.94140625,
"rewards/step_correlation_reward": -0.016642220318317413,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6359574198722839,
"adv/mean_abs_reasoning": 0.6153806447982788,
"adv/mean_abs_step_conf": 0.7664022445678711,
"adv/ratio_final_to_reasoning": 1.0334374752406297,
"adv/ratio_step_to_reasoning": 1.2454116830715354,
"adv/std_final_conf": 0.858352780342102,
"adv/std_reasoning": 0.8266722559928894,
"adv/std_step_conf": 0.9365108609199524,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6957075788061703,
"calib/avg_num_step_conf": 8.453125,
"calib/ece": 0.2355263157894737,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7206477732793523,
"calib/gap": 0.32593259557344045,
"calib/mean_conf": 0.7736639676113359,
"calib/mu_c": 0.9122183098591548,
"calib/mu_w": 0.5862857142857143,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21714574898785427,
"calib/std_conf": 0.36504289088194597,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8983413848631242,
"calib/step_q_c_n": 1242.0,
"calib/step_q_gap": 0.04184463865921961,
"calib/step_q_w": 0.8564967462039046,
"calib/step_q_w_n": 922.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 771.73046875,
"completions/mean_terminated_length": 780.8814697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.025098109617829323,
"kl": 0.1585693359375,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0358,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019676584750413895,
"mask/share_reasoning": 0.8429697155952454,
"mask/share_step_conf": 0.12563498318195343,
"num_tokens": 53387258.0,
"reward": 0.6907268762588501,
"reward_std": 0.4108719229698181,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7160639762878418,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.36148351430892944,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.6260363459587097,
"adv/mean_abs_reasoning": 0.42240333557128906,
"adv/mean_abs_step_conf": 0.7517305612564087,
"adv/ratio_final_to_reasoning": 1.4820819184867764,
"adv/ratio_step_to_reasoning": 1.779651101096807,
"adv/std_final_conf": 0.8498795032501221,
"adv/std_reasoning": 0.7014126181602478,
"adv/std_step_conf": 0.9360769391059875,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7296880131362891,
"calib/avg_num_step_conf": 8.5546875,
"calib/ece": 0.21883999999999992,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.712,
"calib/gap": 0.35711986863710987,
"calib/mean_conf": 0.78132,
"calib/mu_c": 0.9313103448275861,
"calib/mu_w": 0.5741904761904763,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21007999999999993,
"calib/std_conf": 0.35204240880893883,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9011712439418418,
"calib/step_q_c_n": 1238.0,
"calib/step_q_gap": 0.037274185118312264,
"calib/step_q_w": 0.8638970588235295,
"calib/step_q_w_n": 952.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 751.49609375,
"completions/mean_terminated_length": 763.4246215820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.1888,
"grad_norm": 0.03494914993643761,
"kl": 0.1652984619140625,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0653,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020012080669403076,
"mask/share_reasoning": 0.8428300619125366,
"mask/share_step_conf": 0.1215328723192215,
"num_tokens": 53683473.0,
"reward": 0.6597840785980225,
"reward_std": 0.35069000720977783,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7479737997055054,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.2630005180835724,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6410762667655945,
"adv/mean_abs_reasoning": 0.5759428143501282,
"adv/mean_abs_step_conf": 0.756990909576416,
"adv/ratio_final_to_reasoning": 1.1130901381050484,
"adv/ratio_step_to_reasoning": 1.314350818719,
"adv/std_final_conf": 0.860005259513855,
"adv/std_reasoning": 0.8267216086387634,
"adv/std_step_conf": 0.9359890222549438,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7718120805369126,
"calib/avg_num_step_conf": 8.94921875,
"calib/ece": 0.19368852459016384,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7377049180327869,
"calib/gap": 0.3920593429883431,
"calib/mean_conf": 0.788360655737705,
"calib/mu_c": 0.9410067114093958,
"calib/mu_w": 0.5489473684210527,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1856967213114753,
"calib/std_conf": 0.35428491973786913,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9087252573238321,
"calib/step_q_c_n": 1263.0,
"calib/step_q_gap": 0.028744712576750264,
"calib/step_q_w": 0.8799805447470819,
"calib/step_q_w_n": 1028.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2829.0,
"completions/max_terminated_length": 2829.0,
"completions/mean_length": 737.47265625,
"completions/mean_terminated_length": 758.2047729492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.029399629682302475,
"kl": 0.16845703125,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0332,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01988956332206726,
"mask/share_reasoning": 0.8306618928909302,
"mask/share_step_conf": 0.12210480123758316,
"num_tokens": 53978338.0,
"reward": 0.6872397065162659,
"reward_std": 0.40832090377807617,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7544710636138916,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.31297701597213745,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.6690976023674011,
"adv/mean_abs_reasoning": 0.5811420679092407,
"adv/mean_abs_step_conf": 0.7728196978569031,
"adv/ratio_final_to_reasoning": 1.151349453627743,
"adv/ratio_step_to_reasoning": 1.329829211361786,
"adv/std_final_conf": 0.8573887348175049,
"adv/std_reasoning": 0.8100538849830627,
"adv/std_step_conf": 0.9364103078842163,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6923720541068191,
"calib/avg_num_step_conf": 8.75390625,
"calib/ece": 0.25281893004115213,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7078189300411523,
"calib/gap": 0.28196520708408834,
"calib/mean_conf": 0.7846707818930042,
"calib/mu_c": 0.9018661971830984,
"calib/mu_w": 0.61990099009901,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22656378600823035,
"calib/std_conf": 0.3496295727158898,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9004632587859426,
"calib/step_q_c_n": 1252.0,
"calib/step_q_gap": 0.0475006703127373,
"calib/step_q_w": 0.8529625884732053,
"calib/step_q_w_n": 989.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 784.828125,
"completions/mean_terminated_length": 803.6640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.023182764649391174,
"kl": 0.1683807373046875,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0486,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.018749622628092766,
"mask/share_reasoning": 0.8373773097991943,
"mask/share_step_conf": 0.12043555825948715,
"num_tokens": 54285518.0,
"reward": 0.6336249113082886,
"reward_std": 0.3934309780597687,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6945639252662659,
"rewards/format_reward_step": 0.94921875,
"rewards/step_correlation_reward": 0.2719046473503113,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.5988168716430664,
"adv/mean_abs_reasoning": 0.47128474712371826,
"adv/mean_abs_step_conf": 0.7732551097869873,
"adv/ratio_final_to_reasoning": 1.2706052451255532,
"adv/ratio_step_to_reasoning": 1.6407386712729704,
"adv/std_final_conf": 0.8279719948768616,
"adv/std_reasoning": 0.7206318974494934,
"adv/std_step_conf": 0.9363507628440857,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6941620879120879,
"calib/avg_num_step_conf": 8.71875,
"calib/ece": 0.21770916334661342,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7330677290836654,
"calib/gap": 0.2895188873626372,
"calib/mean_conf": 0.807191235059761,
"calib/mu_c": 0.91215625,
"calib/mu_w": 0.6226373626373628,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1937250996015935,
"calib/std_conf": 0.32948677234491136,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8736913767019667,
"calib/step_q_c_n": 1322.0,
"calib/step_q_gap": 0.027339728350318282,
"calib/step_q_w": 0.8463516483516484,
"calib/step_q_w_n": 910.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2885.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 817.60546875,
"completions/mean_terminated_length": 820.8118286132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 409.0,
"epoch": 0.192,
"grad_norm": 0.03066716343164444,
"kl": 0.1658477783203125,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0516,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018639039248228073,
"mask/share_reasoning": 0.8629419207572937,
"mask/share_step_conf": 0.11451278626918793,
"num_tokens": 54598681.0,
"reward": 0.7222050428390503,
"reward_std": 0.339335173368454,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.750390887260437,
"rewards/format_reward_step": 0.98046875,
"rewards/step_correlation_reward": 0.3729255199432373,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6700544357299805,
"adv/mean_abs_reasoning": 0.5483631491661072,
"adv/mean_abs_step_conf": 0.8035197257995605,
"adv/ratio_final_to_reasoning": 1.2219173311498568,
"adv/ratio_step_to_reasoning": 1.4653058416151206,
"adv/std_final_conf": 0.8462985754013062,
"adv/std_reasoning": 0.7755341529846191,
"adv/std_step_conf": 0.9353812336921692,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7128220140515222,
"calib/avg_num_step_conf": 8.2109375,
"calib/ece": 0.2820362903225807,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6612903225806451,
"calib/gap": 0.3125117096018738,
"calib/mean_conf": 0.7534072580645162,
"calib/mu_c": 0.9071428571428573,
"calib/mu_w": 0.5946311475409835,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2636895161290323,
"calib/std_conf": 0.3576950509350801,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9033270142180094,
"calib/step_q_c_n": 1055.0,
"calib/step_q_gap": 0.02008919186843927,
"calib/step_q_w": 0.8832378223495702,
"calib/step_q_w_n": 1047.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 729.48046875,
"completions/mean_terminated_length": 746.988037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.043537769466638565,
"kl": 0.1852874755859375,
"learning_rate": 5.277777777777779e-07,
"loss": -0.1066,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01987295411527157,
"mask/share_reasoning": 0.8370122909545898,
"mask/share_step_conf": 0.11967720091342926,
"num_tokens": 54891692.0,
"reward": 0.5772460103034973,
"reward_std": 0.40024036169052124,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6956995129585266,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.16660496592521667,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.6767345666885376,
"adv/mean_abs_reasoning": 0.4965467154979706,
"adv/mean_abs_step_conf": 0.7697621583938599,
"adv/ratio_final_to_reasoning": 1.3628819717594194,
"adv/ratio_step_to_reasoning": 1.5502310948163063,
"adv/std_final_conf": 0.8484857082366943,
"adv/std_reasoning": 0.7393876314163208,
"adv/std_step_conf": 0.9362363815307617,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6834075015893197,
"calib/avg_num_step_conf": 8.13671875,
"calib/ece": 0.27547430830039515,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7391304347826086,
"calib/gap": 0.2484335664335665,
"calib/mean_conf": 0.8079644268774704,
"calib/mu_c": 0.9159790209790208,
"calib/mu_w": 0.6675454545454543,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25911067193675874,
"calib/std_conf": 0.33439330160378195,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8946246764452115,
"calib/step_q_c_n": 1159.0,
"calib/step_q_gap": 0.011410390730925646,
"calib/step_q_w": 0.8832142857142858,
"calib/step_q_w_n": 924.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2953.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 766.30078125,
"completions/mean_terminated_length": 769.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.029424257576465607,
"kl": 0.160125732421875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0336,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01990341767668724,
"mask/share_reasoning": 0.8551667332649231,
"mask/share_step_conf": 0.12102358043193817,
"num_tokens": 55194025.0,
"reward": 0.5911649465560913,
"reward_std": 0.4348578155040741,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6973428726196289,
"rewards/format_reward_step": 0.98828125,
"rewards/step_correlation_reward": 0.17561198770999908,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6397321224212646,
"adv/mean_abs_reasoning": 0.5764783620834351,
"adv/mean_abs_step_conf": 0.7763506174087524,
"adv/ratio_final_to_reasoning": 1.1097244311290815,
"adv/ratio_step_to_reasoning": 1.3467125021014916,
"adv/std_final_conf": 0.8472661375999451,
"adv/std_reasoning": 0.8100085854530334,
"adv/std_step_conf": 0.9365969896316528,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6715827832783278,
"calib/avg_num_step_conf": 8.19921875,
"calib/ece": 0.24405714285714283,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6448979591836734,
"calib/gap": 0.26941996699669957,
"calib/mean_conf": 0.7220163265306123,
"calib/mu_c": 0.8330833333333333,
"calib/mu_w": 0.5636633663366337,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18915918367346934,
"calib/std_conf": 0.3782347286243372,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.888424599831508,
"calib/step_q_c_n": 1187.0,
"calib/step_q_gap": 0.015278766498174612,
"calib/step_q_w": 0.8731458333333334,
"calib/step_q_w_n": 912.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 789.6171875,
"completions/mean_terminated_length": 805.3466186523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.1952,
"grad_norm": 0.025317171588540077,
"kl": 0.1699066162109375,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0435,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019160984084010124,
"mask/share_reasoning": 0.847537636756897,
"mask/share_step_conf": 0.11377011984586716,
"num_tokens": 55502847.0,
"reward": 0.6211838126182556,
"reward_std": 0.3905571401119232,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6959280371665955,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.24253329634666443,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.6272495985031128,
"adv/mean_abs_reasoning": 0.5777990818023682,
"adv/mean_abs_step_conf": 0.7672903537750244,
"adv/ratio_final_to_reasoning": 1.085584277057849,
"adv/ratio_step_to_reasoning": 1.3279535706106753,
"adv/std_final_conf": 0.8438796401023865,
"adv/std_reasoning": 0.809932291507721,
"adv/std_step_conf": 0.936083197593689,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7013080399945213,
"calib/avg_num_step_conf": 8.265625,
"calib/ece": 0.2151336032388664,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.708502024291498,
"calib/gap": 0.31017607177099005,
"calib/mean_conf": 0.7942429149797571,
"calib/mu_c": 0.9173087248322146,
"calib/mu_w": 0.6071326530612245,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20306882591093117,
"calib/std_conf": 0.33521408607401043,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8948870292887028,
"calib/step_q_c_n": 1195.0,
"calib/step_q_gap": 0.010598212784902472,
"calib/step_q_w": 0.8842888165038003,
"calib/step_q_w_n": 921.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 781.8359375,
"completions/mean_terminated_length": 800.6000366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.03133862093091011,
"kl": 0.1655120849609375,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0385,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01901405304670334,
"mask/share_reasoning": 0.8426245450973511,
"mask/share_step_conf": 0.11492390930652618,
"num_tokens": 55808277.0,
"reward": 0.6738928556442261,
"reward_std": 0.37431710958480835,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7334769368171692,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.305714875459671,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.5357584953308105,
"adv/mean_abs_reasoning": 0.3198157846927643,
"adv/mean_abs_step_conf": 0.7774848937988281,
"adv/ratio_final_to_reasoning": 1.675209670609269,
"adv/ratio_step_to_reasoning": 2.4310397766818492,
"adv/std_final_conf": 0.7776016592979431,
"adv/std_reasoning": 0.5961856842041016,
"adv/std_step_conf": 0.9354940056800842,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7680244534520566,
"calib/avg_num_step_conf": 8.53515625,
"calib/ece": 0.1909039999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.72,
"calib/gap": 0.4208968037743369,
"calib/mean_conf": 0.786904,
"calib/mu_c": 0.9569463087248321,
"calib/mu_w": 0.5360495049504952,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1909039999999999,
"calib/std_conf": 0.3492250832686564,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8939076923076923,
"calib/step_q_c_n": 1300.0,
"calib/step_q_gap": 0.049399217731420975,
"calib/step_q_w": 0.8445084745762713,
"calib/step_q_w_n": 885.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2739.0,
"completions/max_terminated_length": 2739.0,
"completions/mean_length": 752.76953125,
"completions/mean_terminated_length": 761.6956787109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 354.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.041517455130815506,
"kl": 0.159759521484375,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0224,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01972859725356102,
"mask/share_reasoning": 0.8467744588851929,
"mask/share_step_conf": 0.12177818268537521,
"num_tokens": 56107906.0,
"reward": 0.6698368787765503,
"reward_std": 0.28276172280311584,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7846717834472656,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.243283212184906,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.6632397770881653,
"adv/mean_abs_reasoning": 0.49668216705322266,
"adv/mean_abs_step_conf": 0.7803338170051575,
"adv/ratio_final_to_reasoning": 1.3353404271047544,
"adv/ratio_step_to_reasoning": 1.5710928814594218,
"adv/std_final_conf": 0.8465085029602051,
"adv/std_reasoning": 0.7393675446510315,
"adv/std_step_conf": 0.9363483786582947,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6806158437330441,
"calib/avg_num_step_conf": 8.55859375,
"calib/ece": 0.2633654618473895,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.2574725990233314,
"calib/mean_conf": 0.7321204819277108,
"calib/mu_c": 0.8324210526315788,
"calib/mu_w": 0.5749484536082474,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19252208835341364,
"calib/std_conf": 0.38435834176155914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8957460197119029,
"calib/step_q_c_n": 1319.0,
"calib/step_q_gap": 0.057707028886214706,
"calib/step_q_w": 0.8380389908256882,
"calib/step_q_w_n": 872.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 744.859375,
"completions/mean_terminated_length": 759.6972045898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.1984,
"grad_norm": 0.03142472729086876,
"kl": 0.168121337890625,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0179,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019757648929953575,
"mask/share_reasoning": 0.8379254341125488,
"mask/share_step_conf": 0.12278568744659424,
"num_tokens": 56403630.0,
"reward": 0.6432186365127563,
"reward_std": 0.3723026514053345,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.702370285987854,
"rewards/format_reward_step": 0.97265625,
"rewards/step_correlation_reward": 0.2707856297492981,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.5947656035423279,
"adv/mean_abs_reasoning": 0.4806092381477356,
"adv/mean_abs_step_conf": 0.766196072101593,
"adv/ratio_final_to_reasoning": 1.2375242844572651,
"adv/ratio_step_to_reasoning": 1.5942183613750476,
"adv/std_final_conf": 0.8132778406143188,
"adv/std_reasoning": 0.7207170128822327,
"adv/std_step_conf": 0.9363957643508911,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6926406926406926,
"calib/avg_num_step_conf": 8.890625,
"calib/ece": 0.25153225806451607,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6129032258064516,
"calib/gap": 0.26433832833832815,
"calib/mean_conf": 0.714516129032258,
"calib/mu_c": 0.8264335664335662,
"calib/mu_w": 0.5620952380952381,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1947177419354838,
"calib/std_conf": 0.37622992892282464,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8784045124899276,
"calib/step_q_c_n": 1241.0,
"calib/step_q_gap": -0.001010946447270511,
"calib/step_q_w": 0.8794154589371981,
"calib/step_q_w_n": 1035.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 803.65234375,
"completions/mean_terminated_length": 816.4087524414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.04483392834663391,
"kl": 0.16119384765625,
"learning_rate": 3.611111111111111e-07,
"loss": 0.01,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018417365849018097,
"mask/share_reasoning": 0.8510687351226807,
"mask/share_step_conf": 0.11488892883062363,
"num_tokens": 56710909.0,
"reward": 0.641819953918457,
"reward_std": 0.36996394395828247,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7017328143119812,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.2764383852481842,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.7041361331939697,
"adv/mean_abs_reasoning": 0.5235980749130249,
"adv/mean_abs_step_conf": 0.7775394916534424,
"adv/ratio_final_to_reasoning": 1.3448027541180974,
"adv/ratio_step_to_reasoning": 1.4849930297826244,
"adv/std_final_conf": 0.8908984661102295,
"adv/std_reasoning": 0.7754470109939575,
"adv/std_step_conf": 0.9361128211021423,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6960666265302027,
"calib/avg_num_step_conf": 8.23828125,
"calib/ece": 0.23476000000000008,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.664,
"calib/gap": 0.2933968827346308,
"calib/mean_conf": 0.75004,
"calib/mu_c": 0.8662251655629136,
"calib/mu_w": 0.5728282828282828,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19040000000000007,
"calib/std_conf": 0.36654112784242915,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8885902720527616,
"calib/step_q_c_n": 1213.0,
"calib/step_q_gap": 0.031045629195618818,
"calib/step_q_w": 0.8575446428571428,
"calib/step_q_w_n": 896.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2965.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 778.60546875,
"completions/mean_terminated_length": 790.96435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.051865581423044205,
"kl": 0.163238525390625,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0086,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019419366493821144,
"mask/share_reasoning": 0.8485928177833557,
"mask/share_step_conf": 0.11636281758546829,
"num_tokens": 57014304.0,
"reward": 0.6489625573158264,
"reward_std": 0.33630305528640747,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7240316867828369,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.2621746361255646,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.5933226943016052,
"adv/mean_abs_reasoning": 0.4214906394481659,
"adv/mean_abs_step_conf": 0.7758980989456177,
"adv/ratio_final_to_reasoning": 1.4076770366203373,
"adv/ratio_step_to_reasoning": 1.8408430136466556,
"adv/std_final_conf": 0.7960253953933716,
"adv/std_reasoning": 0.7015754580497742,
"adv/std_step_conf": 0.9360262751579285,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7496066933638443,
"calib/avg_num_step_conf": 8.31640625,
"calib/ece": 0.17793032786885243,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.569672131147541,
"calib/gap": 0.3891518878718536,
"calib/mean_conf": 0.6594877049180328,
"calib/mu_c": 0.806217105263158,
"calib/mu_w": 0.4170652173913044,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10723360655737701,
"calib/std_conf": 0.4012795158599938,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8838832684824902,
"calib/step_q_c_n": 1285.0,
"calib/step_q_gap": 0.07731336326922011,
"calib/step_q_w": 0.8065699052132701,
"calib/step_q_w_n": 844.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1625.0,
"completions/max_terminated_length": 1625.0,
"completions/mean_length": 710.45703125,
"completions/mean_terminated_length": 730.4296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.2016,
"grad_norm": 0.04199478030204773,
"kl": 0.18646240234375,
"learning_rate": 3.055555555555556e-07,
"loss": -0.1269,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.019616516306996346,
"mask/share_reasoning": 0.8344583511352539,
"mask/share_step_conf": 0.11858132481575012,
"num_tokens": 57303949.0,
"reward": 0.6880927085876465,
"reward_std": 0.2997852861881256,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7487436532974243,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.31728553771972656,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6792967319488525,
"adv/mean_abs_reasoning": 0.5358800888061523,
"adv/mean_abs_step_conf": 0.7768614292144775,
"adv/ratio_final_to_reasoning": 1.2676282365000864,
"adv/ratio_step_to_reasoning": 1.4496926559544128,
"adv/std_final_conf": 0.8648825883865356,
"adv/std_reasoning": 0.7754979729652405,
"adv/std_step_conf": 0.9365853667259216,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7335120470253859,
"calib/avg_num_step_conf": 9.109375,
"calib/ece": 0.21571999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.592,
"calib/gap": 0.3768548543375751,
"calib/mean_conf": 0.67572,
"calib/mu_c": 0.8460583941605839,
"calib/mu_w": 0.46920353982300883,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17171999999999998,
"calib/std_conf": 0.4000196015197255,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8913212669683259,
"calib/step_q_c_n": 1105.0,
"calib/step_q_gap": 0.04610529304819555,
"calib/step_q_w": 0.8452159739201304,
"calib/step_q_w_n": 1227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2496.0,
"completions/max_terminated_length": 2496.0,
"completions/mean_length": 802.4140625,
"completions/mean_terminated_length": 815.1508178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.02791653200984001,
"kl": 0.164642333984375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0435,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01869148015975952,
"mask/share_reasoning": 0.851791262626648,
"mask/share_step_conf": 0.11389227211475372,
"num_tokens": 57614975.0,
"reward": 0.6310009360313416,
"reward_std": 0.34945589303970337,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7447917461395264,
"rewards/format_reward_step": 0.9765625,
"rewards/step_correlation_reward": 0.21486634016036987,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6342035531997681,
"adv/mean_abs_reasoning": 0.44904839992523193,
"adv/mean_abs_step_conf": 0.7958725690841675,
"adv/ratio_final_to_reasoning": 1.4123278321565451,
"adv/ratio_step_to_reasoning": 1.7723536465483074,
"adv/std_final_conf": 0.8569350838661194,
"adv/std_reasoning": 0.7015075087547302,
"adv/std_step_conf": 0.9364147782325745,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6329937938478143,
"calib/avg_num_step_conf": 8.7421875,
"calib/ece": 0.29515918367346944,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7020408163265306,
"calib/gap": 0.2214092687533732,
"calib/mean_conf": 0.7685469387755102,
"calib/mu_c": 0.8670514705882354,
"calib/mu_w": 0.6456422018348622,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25430204081632657,
"calib/std_conf": 0.362472511048961,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8939222316145393,
"calib/step_q_c_n": 1183.0,
"calib/step_q_gap": 0.05395066763349665,
"calib/step_q_w": 0.8399715639810427,
"calib/step_q_w_n": 1055.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2650.0,
"completions/max_terminated_length": 2650.0,
"completions/mean_length": 729.98046875,
"completions/mean_terminated_length": 753.5281982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 350.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.02957489714026451,
"kl": 0.1614990234375,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0753,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019972363486886024,
"mask/share_reasoning": 0.8223469257354736,
"mask/share_step_conf": 0.1264307200908661,
"num_tokens": 57906018.0,
"reward": 0.5725820660591125,
"reward_std": 0.3514135479927063,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6559983491897583,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.1915094554424286,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6305051445960999,
"adv/mean_abs_reasoning": 0.48335394263267517,
"adv/mean_abs_step_conf": 0.7706685066223145,
"adv/ratio_final_to_reasoning": 1.3044377814773558,
"adv/ratio_step_to_reasoning": 1.5944185795293782,
"adv/std_final_conf": 0.8294618129730225,
"adv/std_reasoning": 0.7755149006843567,
"adv/std_step_conf": 0.9359073042869568,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.78238244983528,
"calib/avg_num_step_conf": 8.9296875,
"calib/ece": 0.16715637860082297,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6954732510288066,
"calib/gap": 0.40719047619047605,
"calib/mean_conf": 0.7602427983539094,
"calib/mu_c": 0.9009999999999999,
"calib/mu_w": 0.49380952380952386,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1365390946502057,
"calib/std_conf": 0.36608740256453653,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.894045718432511,
"calib/step_q_c_n": 1378.0,
"calib/step_q_gap": 0.08005893429154187,
"calib/step_q_w": 0.8139867841409691,
"calib/step_q_w_n": 908.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 800.26953125,
"completions/mean_terminated_length": 816.211181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.2048,
"grad_norm": 0.04933116212487221,
"kl": 0.1651611328125,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0314,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01982063613831997,
"mask/share_reasoning": 0.8341763615608215,
"mask/share_step_conf": 0.12647175788879395,
"num_tokens": 58215863.0,
"reward": 0.7209115028381348,
"reward_std": 0.3499925136566162,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7715025544166565,
"rewards/format_reward_step": 0.94921875,
"rewards/step_correlation_reward": 0.35625797510147095,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7361017465591431,
"adv/mean_abs_reasoning": 0.6458040475845337,
"adv/mean_abs_step_conf": 0.7798423767089844,
"adv/ratio_final_to_reasoning": 1.1398221322897326,
"adv/ratio_step_to_reasoning": 1.2075526309037348,
"adv/std_final_conf": 0.9041551947593689,
"adv/std_reasoning": 0.8430927991867065,
"adv/std_step_conf": 0.9367273449897766,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7159560723514211,
"calib/avg_num_step_conf": 8.09765625,
"calib/ece": 0.23799196787148594,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5261044176706827,
"calib/gap": 0.31876162790697676,
"calib/mean_conf": 0.6362248995983937,
"calib/mu_c": 0.7898449612403101,
"calib/mu_w": 0.47108333333333335,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1780722891566265,
"calib/std_conf": 0.40659745644660067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8885907335907336,
"calib/step_q_c_n": 1036.0,
"calib/step_q_gap": 0.060432585085429924,
"calib/step_q_w": 0.8281581485053037,
"calib/step_q_w_n": 1037.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3052.0,
"completions/max_terminated_length": 3052.0,
"completions/mean_length": 782.04296875,
"completions/mean_terminated_length": 797.6215209960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.03797115013003349,
"kl": 0.1591644287109375,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0038,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018755977973341942,
"mask/share_reasoning": 0.850509524345398,
"mask/share_step_conf": 0.11120319366455078,
"num_tokens": 58521778.0,
"reward": 0.5878036022186279,
"reward_std": 0.4244157671928406,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.706500768661499,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.1745751053094864,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.5197247266769409,
"adv/mean_abs_reasoning": 0.38382184505462646,
"adv/mean_abs_step_conf": 0.7647292613983154,
"adv/ratio_final_to_reasoning": 1.3540780270153003,
"adv/ratio_step_to_reasoning": 1.9924068190789852,
"adv/std_final_conf": 0.7646445631980896,
"adv/std_reasoning": 0.6614271998405457,
"adv/std_step_conf": 0.9360878467559814,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7661644029428409,
"calib/avg_num_step_conf": 8.5546875,
"calib/ece": 0.16677551020408168,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6530612244897959,
"calib/gap": 0.42447439162422196,
"calib/mean_conf": 0.718938775510204,
"calib/mu_c": 0.8800657894736843,
"calib/mu_w": 0.45559139784946234,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13265306122448983,
"calib/std_conf": 0.3857615001877093,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8965129007036746,
"calib/step_q_c_n": 1279.0,
"calib/step_q_gap": 0.057533756905650546,
"calib/step_q_w": 0.8389791437980241,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 730.33203125,
"completions/mean_terminated_length": 753.89111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 372.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.036260880529880524,
"kl": 0.17108154296875,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019665461033582687,
"mask/share_reasoning": 0.8306401968002319,
"mask/share_step_conf": 0.11844430863857269,
"num_tokens": 58814687.0,
"reward": 0.6782735586166382,
"reward_std": 0.28763946890830994,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7712780833244324,
"rewards/format_reward_step": 0.95703125,
"rewards/step_correlation_reward": 0.27511268854141235,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6894845962524414,
"adv/mean_abs_reasoning": 0.49503880739212036,
"adv/mean_abs_step_conf": 0.7993884086608887,
"adv/ratio_final_to_reasoning": 1.3927889813016225,
"adv/ratio_step_to_reasoning": 1.6147994798066263,
"adv/std_final_conf": 0.8758496642112732,
"adv/std_reasoning": 0.7577955722808838,
"adv/std_step_conf": 0.9363441467285156,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6927521008403362,
"calib/avg_num_step_conf": 8.203125,
"calib/ece": 0.22973140495867767,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5909090909090909,
"calib/gap": 0.29902380952380936,
"calib/mean_conf": 0.6888223140495867,
"calib/mu_c": 0.8148571428571428,
"calib/mu_w": 0.5158333333333335,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17002066115702477,
"calib/std_conf": 0.39343785336538334,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8926034482758621,
"calib/step_q_c_n": 1160.0,
"calib/step_q_gap": 0.024358767424798256,
"calib/step_q_w": 0.8682446808510639,
"calib/step_q_w_n": 940.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 765.34375,
"completions/mean_terminated_length": 790.0322265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.208,
"grad_norm": 0.03882647678256035,
"kl": 0.1652374267578125,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.1117,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018789615482091904,
"mask/share_reasoning": 0.8310298919677734,
"mask/share_step_conf": 0.11893048882484436,
"num_tokens": 59116599.0,
"reward": 0.5893170237541199,
"reward_std": 0.33620980381965637,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.694831132888794,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.18536534905433655,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.5781576633453369,
"adv/mean_abs_reasoning": 0.42664217948913574,
"adv/mean_abs_step_conf": 0.7782294750213623,
"adv/ratio_final_to_reasoning": 1.355134797121154,
"adv/ratio_step_to_reasoning": 1.8240800193577194,
"adv/std_final_conf": 0.794323205947876,
"adv/std_reasoning": 0.7014920711517334,
"adv/std_step_conf": 0.9358806014060974,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6818520778252322,
"calib/avg_num_step_conf": 8.140625,
"calib/ece": 0.24919354838709687,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7459677419354839,
"calib/gap": 0.3051901565995525,
"calib/mean_conf": 0.7866935483870968,
"calib/mu_c": 0.9085234899328859,
"calib/mu_w": 0.6033333333333334,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21754032258064526,
"calib/std_conf": 0.35887530805639195,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8958135731807031,
"calib/step_q_c_n": 1223.0,
"calib/step_q_gap": 0.027497661450157196,
"calib/step_q_w": 0.8683159117305459,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3002.0,
"completions/max_terminated_length": 3002.0,
"completions/mean_length": 700.89453125,
"completions/mean_terminated_length": 706.4133911132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.019181951880455017,
"kl": 0.175933837890625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0149,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02079775743186474,
"mask/share_reasoning": 0.8414372205734253,
"mask/share_step_conf": 0.12995250523090363,
"num_tokens": 59398572.0,
"reward": 0.656751275062561,
"reward_std": 0.3294701874256134,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7199835777282715,
"rewards/format_reward_step": 0.96875,
"rewards/step_correlation_reward": 0.2833626866340637,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.7014039754867554,
"adv/mean_abs_reasoning": 0.5561388731002808,
"adv/mean_abs_step_conf": 0.7772125005722046,
"adv/ratio_final_to_reasoning": 1.2612029286438335,
"adv/ratio_step_to_reasoning": 1.3975151498393112,
"adv/std_final_conf": 0.8776654601097107,
"adv/std_reasoning": 0.7929669618606567,
"adv/std_step_conf": 0.9363115429878235,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7232647296206618,
"calib/avg_num_step_conf": 8.578125,
"calib/ece": 0.2052049180327868,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5286885245901639,
"calib/gap": 0.34411218724778025,
"calib/mean_conf": 0.6516803278688524,
"calib/mu_c": 0.818095238095238,
"calib/mu_w": 0.47398305084745773,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17024590163934417,
"calib/std_conf": 0.391695615480666,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8928584729981378,
"calib/step_q_c_n": 1074.0,
"calib/step_q_gap": 0.03971230544020554,
"calib/step_q_w": 0.8531461675579323,
"calib/step_q_w_n": 1122.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 747.97265625,
"completions/mean_terminated_length": 781.5550537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.021937143057584763,
"kl": 0.1743927001953125,
"learning_rate": 8.333333333333334e-08,
"loss": -0.1275,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.018995754420757294,
"mask/share_reasoning": 0.8196362257003784,
"mask/share_step_conf": 0.11839927732944489,
"num_tokens": 59695109.0,
"reward": 0.6292377710342407,
"reward_std": 0.3564552962779999,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7152363061904907,
"rewards/format_reward_step": 0.953125,
"rewards/step_correlation_reward": 0.2541767358779907,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.5963702201843262,
"adv/mean_abs_reasoning": 0.4490445554256439,
"adv/mean_abs_step_conf": 0.8115639090538025,
"adv/ratio_final_to_reasoning": 1.3280869637068287,
"adv/ratio_step_to_reasoning": 1.8073126580602472,
"adv/std_final_conf": 0.8019877076148987,
"adv/std_reasoning": 0.6818601489067078,
"adv/std_step_conf": 0.9362412691116333,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7639022051773731,
"calib/avg_num_step_conf": 8.68359375,
"calib/ece": 0.17732388663967605,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.611336032388664,
"calib/gap": 0.4143632379126146,
"calib/mean_conf": 0.6918987854251011,
"calib/mu_c": 0.8563020134228186,
"calib/mu_w": 0.44193877551020405,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13299190283400802,
"calib/std_conf": 0.3922373820082409,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.908143708116158,
"calib/step_q_c_n": 1343.0,
"calib/step_q_gap": 0.025518708116157973,
"calib/step_q_w": 0.882625,
"calib/step_q_w_n": 880.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 741.4921875,
"completions/mean_terminated_length": 750.2846069335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.2112,
"grad_norm": 0.0298744086176157,
"kl": 0.18133544921875,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0256,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020268086344003677,
"mask/share_reasoning": 0.8358542919158936,
"mask/share_step_conf": 0.13215884566307068,
"num_tokens": 59990315.0,
"reward": 0.7111806273460388,
"reward_std": 0.30690059065818787,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7692664265632629,
"rewards/format_reward_step": 0.96484375,
"rewards/step_correlation_reward": 0.3437197804450989,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.7013013362884521,
"adv/mean_abs_reasoning": 0.5438744425773621,
"adv/mean_abs_step_conf": 0.7945132851600647,
"adv/ratio_final_to_reasoning": 1.2894544795395442,
"adv/ratio_step_to_reasoning": 1.4608395301587482,
"adv/std_final_conf": 0.8762961626052856,
"adv/std_reasoning": 0.7754784226417542,
"adv/std_step_conf": 0.9364268779754639,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6499397227245329,
"calib/avg_num_step_conf": 8.31640625,
"calib/ece": 0.2786772727272726,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.640495867768595,
"calib/gap": 0.21296984629294757,
"calib/mean_conf": 0.7159500000000001,
"calib/mu_c": 0.7898734177215191,
"calib/mu_w": 0.5769035714285715,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17086735537190073,
"calib/std_conf": 0.38994052632452547,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8802288329519451,
"calib/step_q_c_n": 1311.0,
"calib/step_q_gap": 0.01898188918666388,
"calib/step_q_w": 0.8612469437652812,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2624.0,
"completions/max_terminated_length": 2624.0,
"completions/mean_length": 754.75,
"completions/mean_terminated_length": 785.4308471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.031152119860053062,
"kl": 0.1701507568359375,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0671,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.01972189173102379,
"mask/share_reasoning": 0.8208757638931274,
"mask/share_step_conf": 0.12033981829881668,
"num_tokens": 60287731.0,
"reward": 0.6673556566238403,
"reward_std": 0.3885536789894104,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6748343706130981,
"rewards/format_reward_step": 0.9453125,
"rewards/step_correlation_reward": 0.34737688302993774,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5053526163101196,
"adv/mean_abs_reasoning": 0.37649911642074585,
"adv/mean_abs_step_conf": 0.742881178855896,
"adv/ratio_final_to_reasoning": 1.3422411747319407,
"adv/ratio_step_to_reasoning": 1.9731286116106321,
"adv/std_final_conf": 0.7446314692497253,
"adv/std_reasoning": 0.6612992286682129,
"adv/std_step_conf": 0.9360516667366028,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7355482933914307,
"calib/avg_num_step_conf": 7.91015625,
"calib/ece": 0.19168016194331977,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6720647773279352,
"calib/gap": 0.37423565722585317,
"calib/mean_conf": 0.7230971659919027,
"calib/mu_c": 0.8518827160493827,
"calib/mu_w": 0.47764705882352954,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1294534412955465,
"calib/std_conf": 0.3892157362494419,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9031725697061039,
"calib/step_q_c_n": 1327.0,
"calib/step_q_gap": 0.04088030609578874,
"calib/step_q_w": 0.8622922636103152,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2847.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 737.1328125,
"completions/mean_terminated_length": 754.8240356445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.023639973253011703,
"kl": 0.168212890625,
"learning_rate": 0.0,
"loss": -0.0846,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01986561343073845,
"mask/share_reasoning": 0.8402686715126038,
"mask/share_step_conf": 0.1164281889796257,
"num_tokens": 60584485.0,
"reward": 0.7310245037078857,
"reward_std": 0.28140828013420105,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7594671249389648,
"rewards/format_reward_step": 0.9609375,
"rewards/step_correlation_reward": 0.3830506503582001,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.025999896985013038,
"train_runtime": 20057.5097,
"train_samples_per_second": 2.553,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 60584485,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}