{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.47760647535324097, "adv/mean_abs_reasoning": 0.4569147527217865, "adv/mean_abs_step_conf": 0.7666968107223511, "adv/ratio_final_to_reasoning": 1.0452857398632815, "adv/ratio_step_to_reasoning": 1.677986552535741, "adv/std_final_conf": 0.7227410674095154, "adv/std_reasoning": 0.7206857204437256, "adv/std_step_conf": 0.9354395866394043, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.025545112788677216, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.084, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018737709149718285, "mask/share_reasoning": 0.845859944820404, "mask/share_step_conf": 0.10805858671665192, "num_tokens": 300991.0, "reward": 0.7390083074569702, "reward_std": 0.33157801628112793, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.46317872405052185, "step": 1 }, { "adv/mean_abs_final_conf": 0.437887966632843, "adv/mean_abs_reasoning": 0.4207462966442108, "adv/mean_abs_step_conf": 0.7360875010490417, "adv/ratio_final_to_reasoning": 1.0407411072310102, "adv/ratio_step_to_reasoning": 1.7494806417071997, "adv/std_final_conf": 0.6832791566848755, "adv/std_reasoning": 0.6817297339439392, "adv/std_step_conf": 0.9336206316947937, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.023035092279314995, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": 0.0122, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01788702979683876, "mask/share_reasoning": 0.8706268668174744, "mask/share_step_conf": 0.09976735711097717, "num_tokens": 619483.0, "reward": 0.6397823095321655, "reward_std": 0.3136184811592102, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3405126631259918, "step": 2 }, { "adv/mean_abs_final_conf": 0.4370732605457306, "adv/mean_abs_reasoning": 0.42643219232559204, "adv/mean_abs_step_conf": 0.7442405223846436, "adv/ratio_final_to_reasoning": 1.0249537169370502, "adv/ratio_step_to_reasoning": 1.745272837695135, "adv/std_final_conf": 0.7216770648956299, "adv/std_reasoning": 0.7205691337585449, "adv/std_step_conf": 0.9328770041465759, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4994155891591789, "calib/avg_num_step_conf": 7.53125, "calib/ece": 0.31421999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.9812988530729676e-05, "calib/mean_conf": 0.99022, "calib/mu_c": 0.9902071005917161, "calib/mu_w": 0.9902469135802469, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31421999999999994, "calib/std_conf": 0.0014323407415835114, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9115477145148358, "calib/step_q_c_n": 1247.0, "calib/step_q_gap": -0.00036124290072947485, "calib/step_q_w": 0.9119089574155653, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 793.375, "completions/mean_terminated_length": 809.1793212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.023815318942070007, "kl": 0.0005033016204833984, "learning_rate": 7.5e-07, "loss": -0.0902, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018328379839658737, "mask/share_reasoning": 0.8629274368286133, "mask/share_step_conf": 0.09921293705701828, "num_tokens": 927843.0, "reward": 0.6633919477462769, "reward_std": 0.3260177969932556, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6661549806594849, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3332850933074951, "step": 3 }, { "adv/mean_abs_final_conf": 0.39261171221733093, "adv/mean_abs_reasoning": 0.39031437039375305, "adv/mean_abs_step_conf": 0.7463158369064331, "adv/ratio_final_to_reasoning": 1.0058858755860316, "adv/ratio_step_to_reasoning": 1.9120890582469259, "adv/std_final_conf": 0.6817169785499573, "adv/std_reasoning": 0.6815755367279053, "adv/std_step_conf": 0.9353012442588806, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5027624309392266, "calib/avg_num_step_conf": 7.88671875, "calib/ece": 0.2689243027888446, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.524861878469789e-05, "calib/mean_conf": 0.9900398406374502, "calib/mu_c": 0.9900552486187845, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2689243027888446, "calib/std_conf": 0.0006299357888781637, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102844638949672, "calib/step_q_c_n": 1371.0, "calib/step_q_gap": -0.0030951657346623973, "calib/step_q_w": 0.9133796296296296, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 790.19921875, "completions/mean_terminated_length": 796.4212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.004266666666666667, "grad_norm": 0.02493300288915634, "kl": 0.0005279183387756348, "learning_rate": 1.0000000000000002e-06, "loss": 0.0252, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018840719014406204, "mask/share_reasoning": 0.8683602809906006, "mask/share_step_conf": 0.10498643666505814, "num_tokens": 1236302.0, "reward": 0.7225152254104614, "reward_std": 0.31173262000083923, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.71240234375, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3951280117034912, "step": 4 }, { "adv/mean_abs_final_conf": 0.42183569073677063, "adv/mean_abs_reasoning": 0.39110061526298523, "adv/mean_abs_step_conf": 0.7385746836662292, "adv/ratio_final_to_reasoning": 1.078586108725803, "adv/ratio_step_to_reasoning": 1.8884518582758922, "adv/std_final_conf": 0.6998975276947021, "adv/std_reasoning": 0.6815629005432129, "adv/std_step_conf": 0.9349147081375122, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4991792929292929, "calib/avg_num_step_conf": 7.78515625, "calib/ece": 0.46246031746031757, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.008151515151515132, "calib/mean_conf": 0.9862698412698413, "calib/mu_c": 0.9901515151515151, "calib/mu_w": 0.982, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46246031746031757, "calib/std_conf": 0.06226841701149678, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9147625354777673, "calib/step_q_c_n": 1057.0, "calib/step_q_gap": 0.01945270641793828, "calib/step_q_w": 0.895309829059829, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 832.88671875, "completions/mean_terminated_length": 839.4448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.005333333333333333, "grad_norm": 0.037833891808986664, "kl": 0.0005629062652587891, "learning_rate": 1.25e-06, "loss": 0.0065, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018345676362514496, "mask/share_reasoning": 0.8695776462554932, "mask/share_step_conf": 0.10426412522792816, "num_tokens": 1556209.0, "reward": 0.44701236486434937, "reward_std": 0.3320618271827698, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.528497576713562, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.06552702188491821, "step": 5 }, { "adv/mean_abs_final_conf": 0.25368258357048035, "adv/mean_abs_reasoning": 0.24287119507789612, "adv/mean_abs_step_conf": 0.7538926601409912, "adv/ratio_final_to_reasoning": 1.0445149063029755, "adv/ratio_step_to_reasoning": 3.1040842859079896, "adv/std_final_conf": 0.5490252375602722, "adv/std_reasoning": 0.5482149720191956, "adv/std_step_conf": 0.934939980506897, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5114931610942249, "calib/avg_num_step_conf": 8.203125, "calib/ece": 0.43260869565217386, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004989868287736954, "calib/mean_conf": 0.9899209486166007, "calib/mu_c": 0.9901418439716309, "calib/mu_w": 0.9896428571428572, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43260869565217386, "calib/std_conf": 0.002666153668622935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.912532637075718, "calib/step_q_c_n": 1149.0, "calib/step_q_gap": 0.00033495043008180403, "calib/step_q_w": 0.9121976866456362, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 734.64453125, "completions/mean_terminated_length": 743.3557739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.0064, "grad_norm": 0.02843756601214409, "kl": 0.0022296905517578125, "learning_rate": 1.5e-06, "loss": -0.0273, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019643045961856842, "mask/share_reasoning": 0.8531935214996338, "mask/share_step_conf": 0.11544472724199295, "num_tokens": 1850230.0, "reward": 0.516380786895752, "reward_std": 0.23685571551322937, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5597362518310547, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1652126908302307, "step": 6 }, { "adv/mean_abs_final_conf": 0.4957495331764221, "adv/mean_abs_reasoning": 0.46634334325790405, "adv/mean_abs_step_conf": 0.7799911499023438, "adv/ratio_final_to_reasoning": 1.0630569522298412, "adv/ratio_step_to_reasoning": 1.6725684223415227, "adv/std_final_conf": 0.7390549778938293, "adv/std_reasoning": 0.7207476496696472, "adv/std_step_conf": 0.9354566335678101, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49412020905923343, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.32886693548387114, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00012224157955864623, "calib/mean_conf": 0.9901572580645163, "calib/mu_c": 0.9901158536585364, "calib/mu_w": 0.990238095238095, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32886693548387114, "calib/std_conf": 0.0012294552548691184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9107335473515248, "calib/step_q_c_n": 1246.0, "calib/step_q_gap": 0.004154986411730399, "calib/step_q_w": 0.9065785609397944, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 839.84765625, "completions/mean_terminated_length": 849.8063354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.007466666666666667, "grad_norm": 0.03482392057776451, "kl": 0.000522613525390625, "learning_rate": 1.75e-06, "loss": -0.0187, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017592590302228928, "mask/share_reasoning": 0.8731827139854431, "mask/share_step_conf": 0.09750597178936005, "num_tokens": 2172655.0, "reward": 0.6363974809646606, "reward_std": 0.3690088987350464, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6468581557273865, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.304843008518219, "step": 7 }, { "adv/mean_abs_final_conf": 0.42081981897354126, "adv/mean_abs_reasoning": 0.404765248298645, "adv/mean_abs_step_conf": 0.7461205124855042, "adv/ratio_final_to_reasoning": 1.0396639057883024, "adv/ratio_step_to_reasoning": 1.843341333332548, "adv/std_final_conf": 0.7008998990058899, "adv/std_reasoning": 0.701344907283783, "adv/std_step_conf": 0.9360982775688171, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5083723137414412, "calib/avg_num_step_conf": 7.7265625, "calib/ece": 0.3893951612903227, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016744627482878638, "calib/mean_conf": 0.990201612903226, "calib/mu_c": 0.9902684563758388, "calib/mu_w": 0.99010101010101, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3893951612903227, "calib/std_conf": 0.0014055181498333381, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9123884514435695, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.013969289766922688, "calib/step_q_w": 0.8984191616766468, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 851.25, "completions/mean_terminated_length": 861.3439331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.008533333333333334, "grad_norm": 0.024893108755350113, "kl": 0.0005701184272766113, "learning_rate": 2.0000000000000003e-06, "loss": 0.0264, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01763007789850235, "mask/share_reasoning": 0.872861385345459, "mask/share_step_conf": 0.09778980910778046, "num_tokens": 2497087.0, "reward": 0.5612791776657104, "reward_std": 0.3177064061164856, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5895148515701294, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.22366851568222046, "step": 8 }, { "adv/mean_abs_final_conf": 0.5682142972946167, "adv/mean_abs_reasoning": 0.5364242792129517, "adv/mean_abs_step_conf": 0.7655105590820312, "adv/ratio_final_to_reasoning": 1.0592628248078326, "adv/ratio_step_to_reasoning": 1.4270617284609075, "adv/std_final_conf": 0.7919481992721558, "adv/std_reasoning": 0.7756170630455017, "adv/std_step_conf": 0.9360333681106567, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.4805194805194805, "calib/avg_num_step_conf": 7.8125, "calib/ece": 0.31365546218487383, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00038961038961016214, "calib/mean_conf": 0.990126050420168, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99038961038961, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31365546218487383, "calib/std_conf": 0.0011156233653236776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9011468970934798, "calib/step_q_c_n": 1273.0, "calib/step_q_gap": -0.007331782411334364, "calib/step_q_w": 0.9084786795048142, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 811.17578125, "completions/mean_terminated_length": 840.7327880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 437.0, "epoch": 0.0096, "grad_norm": 0.018573205918073654, "kl": 0.0006935596466064453, "learning_rate": 2.25e-06, "loss": -0.1026, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01758037693798542, "mask/share_reasoning": 0.8479781150817871, "mask/share_step_conf": 0.09928528964519501, "num_tokens": 2812284.0, "reward": 0.6472536325454712, "reward_std": 0.39420419931411743, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6345956921577454, "rewards/format_reward_step": 0.9296875, "rewards/step_correlation_reward": 0.34819287061691284, "step": 9 }, { "adv/mean_abs_final_conf": 0.5329347848892212, "adv/mean_abs_reasoning": 0.5287434458732605, "adv/mean_abs_step_conf": 0.7661516070365906, "adv/ratio_final_to_reasoning": 1.0079269805586684, "adv/ratio_step_to_reasoning": 1.449004452000785, "adv/std_final_conf": 0.7758122086524963, "adv/std_reasoning": 0.7754489779472351, "adv/std_step_conf": 0.9358128905296326, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4855769230769231, "calib/avg_num_step_conf": 7.62890625, "calib/ece": 0.40777510040160647, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024999999999997247, "calib/mean_conf": 0.9901044176706828, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9902500000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40777510040160647, "calib/std_conf": 0.0009554013445488028, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9091376146788991, "calib/step_q_c_n": 1090.0, "calib/step_q_gap": 0.0028560387808688192, "calib/step_q_w": 0.9062815758980303, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 834.94140625, "completions/mean_terminated_length": 851.57373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.010666666666666666, "grad_norm": 0.023835062980651855, "kl": 0.0006435513496398926, "learning_rate": 2.5e-06, "loss": 0.0107, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01767464354634285, "mask/share_reasoning": 0.8661353588104248, "mask/share_step_conf": 0.09665870666503906, "num_tokens": 3132829.0, "reward": 0.5643854737281799, "reward_std": 0.4170411229133606, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5742319822311401, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.24672642350196838, "step": 10 }, { "adv/mean_abs_final_conf": 0.4380926489830017, "adv/mean_abs_reasoning": 0.37450408935546875, "adv/mean_abs_step_conf": 0.7518080472946167, "adv/ratio_final_to_reasoning": 1.169794032788722, "adv/ratio_step_to_reasoning": 2.007476202966162, "adv/std_final_conf": 0.7370259165763855, "adv/std_reasoning": 0.7013659477233887, "adv/std_step_conf": 0.9356460571289062, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5035284350352844, "calib/avg_num_step_conf": 7.5546875, "calib/ece": 0.39430612244897956, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.770582537709899e-05, "calib/mean_conf": 0.9902244897959184, "calib/mu_c": 0.9902397260273972, "calib/mu_w": 0.9902020202020201, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39430612244897956, "calib/std_conf": 0.001446534297687295, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.912566371681416, "calib/step_q_c_n": 1130.0, "calib/step_q_gap": 0.001235525910271651, "calib/step_q_w": 0.9113308457711443, "calib/step_q_w_n": 804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 819.5546875, "completions/mean_terminated_length": 842.5943603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.011733333333333333, "grad_norm": 0.02992285043001175, "kl": 0.0006269216537475586, "learning_rate": 2.7500000000000004e-06, "loss": -0.0341, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017518922686576843, "mask/share_reasoning": 0.8593878746032715, "mask/share_step_conf": 0.09574948251247406, "num_tokens": 3447115.0, "reward": 0.5951333045959473, "reward_std": 0.29580140113830566, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5777971744537354, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.3070007264614105, "step": 11 }, { "adv/mean_abs_final_conf": 0.42120277881622314, "adv/mean_abs_reasoning": 0.39798927307128906, "adv/mean_abs_step_conf": 0.7613787651062012, "adv/ratio_final_to_reasoning": 1.05832696335204, "adv/ratio_step_to_reasoning": 1.9130635336742372, "adv/std_final_conf": 0.682644784450531, "adv/std_reasoning": 0.6817029714584351, "adv/std_step_conf": 0.9329316020011902, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.511436658506732, "calib/avg_num_step_conf": 7.9453125, "calib/ece": 0.2926572580645159, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.013121481028151805, "calib/mean_conf": 0.9862056451612904, "calib/mu_c": 0.9902267441860464, "calib/mu_w": 0.9771052631578946, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2926572580645159, "calib/std_conf": 0.06276590249904203, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9126993416239942, "calib/step_q_c_n": 1367.0, "calib/step_q_gap": 0.0034339743076523543, "calib/step_q_w": 0.9092653673163419, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 754.33984375, "completions/mean_terminated_length": 778.67333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.0128, "grad_norm": 0.035631634294986725, "kl": 0.0007064938545227051, "learning_rate": 3e-06, "loss": -0.0519, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01888711005449295, "mask/share_reasoning": 0.8392666578292847, "mask/share_step_conf": 0.11059625446796417, "num_tokens": 3744402.0, "reward": 0.7010163068771362, "reward_std": 0.2875725030899048, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6814679503440857, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.392439603805542, "step": 12 }, { "adv/mean_abs_final_conf": 0.485071063041687, "adv/mean_abs_reasoning": 0.46181046962738037, "adv/mean_abs_step_conf": 0.7558674812316895, "adv/ratio_final_to_reasoning": 1.0503682678157444, "adv/ratio_step_to_reasoning": 1.6367482570102276, "adv/std_final_conf": 0.7590910792350769, "adv/std_reasoning": 0.7392831444740295, "adv/std_step_conf": 0.9358668327331543, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5147058823529412, "calib/avg_num_step_conf": 7.84765625, "calib/ece": 0.3279681274900399, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.002177888022678931, "calib/mean_conf": 0.9893227091633466, "calib/mu_c": 0.9900602409638553, "calib/mu_w": 0.9878823529411763, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3279681274900399, "calib/std_conf": 0.008030038366434497, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9123665338645418, "calib/step_q_c_n": 1255.0, "calib/step_q_gap": 0.007976613440138669, "calib/step_q_w": 0.9043899204244031, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 787.69140625, "completions/mean_terminated_length": 800.1945190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.013866666666666666, "grad_norm": 0.03413955122232437, "kl": 0.0016703009605407715, "learning_rate": 3.2500000000000002e-06, "loss": 0.0048, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018558364361524582, "mask/share_reasoning": 0.8621881008148193, "mask/share_step_conf": 0.10362851619720459, "num_tokens": 4050643.0, "reward": 0.6856704354286194, "reward_std": 0.37686243653297424, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6563093662261963, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3892502188682556, "step": 13 }, { "adv/mean_abs_final_conf": 0.5435507893562317, "adv/mean_abs_reasoning": 0.5240600109100342, "adv/mean_abs_step_conf": 0.7819471955299377, "adv/ratio_final_to_reasoning": 1.0371918826860145, "adv/ratio_step_to_reasoning": 1.4920947587129965, "adv/std_final_conf": 0.7767627835273743, "adv/std_reasoning": 0.7755653858184814, "adv/std_step_conf": 0.9358567595481873, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5077658303464754, "calib/avg_num_step_conf": 7.19140625, "calib/ece": 0.36823983739837396, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001515918195236221, "calib/mean_conf": 0.9901910569105691, "calib/mu_c": 0.9902483660130718, "calib/mu_w": 0.9900967741935481, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36823983739837396, "calib/std_conf": 0.0013282734150218745, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9124896193771628, "calib/step_q_c_n": 1156.0, "calib/step_q_gap": 0.006091079231177354, "calib/step_q_w": 0.9063985401459854, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 847.62109375, "completions/mean_terminated_length": 867.9640502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.014933333333333333, "grad_norm": 0.02671867609024048, "kl": 0.011602401733398438, "learning_rate": 3.5e-06, "loss": -0.0298, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017643479630351067, "mask/share_reasoning": 0.8625574707984924, "mask/share_step_conf": 0.09636152535676956, "num_tokens": 4373034.0, "reward": 0.5819574594497681, "reward_std": 0.41870835423469543, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6047573685646057, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.2474387288093567, "step": 14 }, { "adv/mean_abs_final_conf": 0.3743641972541809, "adv/mean_abs_reasoning": 0.350879430770874, "adv/mean_abs_step_conf": 0.7743525505065918, "adv/ratio_final_to_reasoning": 1.066931157610782, "adv/ratio_step_to_reasoning": 2.206890694063648, "adv/std_final_conf": 0.6611832976341248, "adv/std_reasoning": 0.6403455138206482, "adv/std_step_conf": 0.9354344010353088, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4946808510638298, "calib/avg_num_step_conf": 7.54296875, "calib/ece": 0.36603600000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.574468085082888e-05, "calib/mean_conf": 0.990036, "calib/mu_c": 0.99, "calib/mu_w": 0.9900957446808508, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36603600000000003, "calib/std_conf": 0.0005680704181701424, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9110808510638297, "calib/step_q_c_n": 1175.0, "calib/step_q_gap": 0.009850692333671085, "calib/step_q_w": 0.9012301587301587, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 751.8203125, "completions/mean_terminated_length": 766.7968139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.016, "grad_norm": 0.044145699590444565, "kl": 0.0020477771759033203, "learning_rate": 3.7500000000000005e-06, "loss": -0.0193, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019235068932175636, "mask/share_reasoning": 0.8573687672615051, "mask/share_step_conf": 0.1038648933172226, "num_tokens": 4673380.0, "reward": 0.6289956569671631, "reward_std": 0.3001975417137146, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6165511608123779, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.32425254583358765, "step": 15 }, { "adv/mean_abs_final_conf": 0.3798687756061554, "adv/mean_abs_reasoning": 0.36612629890441895, "adv/mean_abs_step_conf": 0.787256121635437, "adv/ratio_final_to_reasoning": 1.0375347980815879, "adv/ratio_step_to_reasoning": 2.150231010422331, "adv/std_final_conf": 0.6405870914459229, "adv/std_reasoning": 0.6404175758361816, "adv/std_step_conf": 0.9351932406425476, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.48214285714285715, "calib/avg_num_step_conf": 7.640625, "calib/ece": 0.32882258064516146, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003333333333332966, "calib/mean_conf": 0.9901129032258066, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9903333333333332, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32882258064516146, "calib/std_conf": 0.0010216185562653634, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119055374592835, "calib/step_q_c_n": 1228.0, "calib/step_q_gap": 0.0032544385581845248, "calib/step_q_w": 0.908651098901099, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 875.140625, "completions/mean_terminated_length": 892.57373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.017066666666666667, "grad_norm": 0.017418626695871353, "kl": 0.0017644166946411133, "learning_rate": 4.000000000000001e-06, "loss": -0.0353, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.016430053859949112, "mask/share_reasoning": 0.8740938901901245, "mask/share_step_conf": 0.08994480222463608, "num_tokens": 5006264.0, "reward": 0.617144763469696, "reward_std": 0.2961219847202301, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6468729972839355, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2655414640903473, "step": 16 }, { "adv/mean_abs_final_conf": 0.4859449863433838, "adv/mean_abs_reasoning": 0.4735613763332367, "adv/mean_abs_step_conf": 0.7723269462585449, "adv/ratio_final_to_reasoning": 1.026149957807017, "adv/ratio_step_to_reasoning": 1.6308909147925785, "adv/std_final_conf": 0.741020143032074, "adv/std_reasoning": 0.7394781708717346, "adv/std_step_conf": 0.9351745843887329, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4934108527131783, "calib/avg_num_step_conf": 7.69921875, "calib/ece": 0.2933481781376518, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": -0.0005967441860463829, "calib/mean_conf": 0.9897044534412955, "calib/mu_c": 0.9895232558139534, "calib/mu_w": 0.9901199999999998, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2933481781376518, "calib/std_conf": 0.005839078563134405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908682059046177, "calib/step_q_c_n": 1321.0, "calib/step_q_gap": -0.004825633261515194, "calib/step_q_w": 0.9135076923076922, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 784.41015625, "completions/mean_terminated_length": 809.7136840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.018133333333333335, "grad_norm": 0.019946565851569176, "kl": 0.0026388168334960938, "learning_rate": 4.25e-06, "loss": -0.0894, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017837045714259148, "mask/share_reasoning": 0.8502132892608643, "mask/share_step_conf": 0.10069967806339264, "num_tokens": 5310601.0, "reward": 0.7013490200042725, "reward_std": 0.37124860286712646, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6775288581848145, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3978252708911896, "step": 17 }, { "adv/mean_abs_final_conf": 0.4173857271671295, "adv/mean_abs_reasoning": 0.41300535202026367, "adv/mean_abs_step_conf": 0.7370244264602661, "adv/ratio_final_to_reasoning": 1.0106060977792146, "adv/ratio_step_to_reasoning": 1.7845396502854636, "adv/std_final_conf": 0.7005000114440918, "adv/std_reasoning": 0.7013863921165466, "adv/std_step_conf": 0.9352988004684448, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5036764705882353, "calib/avg_num_step_conf": 7.109375, "calib/ece": 0.434934693877551, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.617647058837495e-05, "calib/mean_conf": 0.9900367346938775, "calib/mu_c": 0.9900661764705883, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.434934693877551, "calib/std_conf": 0.0005738142619033464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.912990099009901, "calib/step_q_c_n": 1010.0, "calib/step_q_gap": 0.008213555800024586, "calib/step_q_w": 0.9047765432098764, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 828.5, "completions/mean_terminated_length": 855.2257690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.0192, "grad_norm": 0.019809581339359283, "kl": 0.004322528839111328, "learning_rate": 4.5e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017656367272138596, "mask/share_reasoning": 0.8579134941101074, "mask/share_step_conf": 0.09318015724420547, "num_tokens": 5633417.0, "reward": 0.48591434955596924, "reward_std": 0.3093951344490051, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5396703481674194, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.1345020830631256, "step": 18 }, { "adv/mean_abs_final_conf": 0.41409170627593994, "adv/mean_abs_reasoning": 0.3675980269908905, "adv/mean_abs_step_conf": 0.7662495374679565, "adv/ratio_final_to_reasoning": 1.126479675817742, "adv/ratio_step_to_reasoning": 2.0844767414568985, "adv/std_final_conf": 0.70262211561203, "adv/std_reasoning": 0.6612752079963684, "adv/std_step_conf": 0.9340653419494629, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48236096673596673, "calib/avg_num_step_conf": 7.51171875, "calib/ece": 0.4024480158730158, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.0004430093555092274, "calib/mean_conf": 0.9897496031746031, "calib/mu_c": 0.9899324324324323, "calib/mu_w": 0.9894894230769231, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4024480158730158, "calib/std_conf": 0.005816373290567585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.911044104410441, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.0006953359375346801, "calib/step_q_w": 0.9103487684729064, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 795.19921875, "completions/mean_terminated_length": 801.4606323242188, "completions/min_length": 0.0, "completions/min_terminated_length": 485.0, "epoch": 0.020266666666666665, "grad_norm": 0.024985479190945625, "kl": 0.004532575607299805, "learning_rate": 4.75e-06, "loss": -0.0252, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018527554348111153, "mask/share_reasoning": 0.8725011348724365, "mask/share_step_conf": 0.10115884244441986, "num_tokens": 5941748.0, "reward": 0.581405520439148, "reward_std": 0.3006322979927063, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5865280628204346, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.26378291845321655, "step": 19 }, { "adv/mean_abs_final_conf": 0.5181697607040405, "adv/mean_abs_reasoning": 0.5007547736167908, "adv/mean_abs_step_conf": 0.7679487466812134, "adv/ratio_final_to_reasoning": 1.0347774759319155, "adv/ratio_step_to_reasoning": 1.5335824781750285, "adv/std_final_conf": 0.7582629323005676, "adv/std_reasoning": 0.7577385902404785, "adv/std_step_conf": 0.9357162714004517, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5255102040816326, "calib/avg_num_step_conf": 7.671875, "calib/ece": 0.3893760330578513, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9793388429752066, "calib/gap": 0.013768565759637363, "calib/mean_conf": 0.9844173553719009, "calib/mu_c": 0.9899930555555557, "calib/mu_w": 0.9762244897959184, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3893760330578513, "calib/std_conf": 0.06444739180417867, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9136850044365572, "calib/step_q_c_n": 1127.0, "calib/step_q_gap": 0.008189186037512841, "calib/step_q_w": 0.9054958183990444, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 795.08203125, "completions/mean_terminated_length": 824.0526733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.021333333333333333, "grad_norm": 0.023194918408989906, "kl": 0.005844593048095703, "learning_rate": 5e-06, "loss": -0.0762, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01830507069826126, "mask/share_reasoning": 0.8429714441299438, "mask/share_step_conf": 0.1035672202706337, "num_tokens": 6250161.0, "reward": 0.5385861992835999, "reward_std": 0.3656718134880066, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5765472650527954, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.19906258583068848, "step": 20 }, { "adv/mean_abs_final_conf": 0.5136041045188904, "adv/mean_abs_reasoning": 0.5023784637451172, "adv/mean_abs_step_conf": 0.7856019735336304, "adv/ratio_final_to_reasoning": 1.0223449880595767, "adv/ratio_step_to_reasoning": 1.5637652292599216, "adv/std_final_conf": 0.7410447597503662, "adv/std_reasoning": 0.7393964529037476, "adv/std_step_conf": 0.9348511099815369, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5060240963855421, "calib/avg_num_step_conf": 7.796875, "calib/ece": 0.3339328063241106, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.036144578344896e-05, "calib/mean_conf": 0.9900592885375493, "calib/mu_c": 0.9900903614457831, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3339328063241106, "calib/std_conf": 0.0007003970413703307, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9135787401574804, "calib/step_q_c_n": 1270.0, "calib/step_q_gap": 0.0030828723888853826, "calib/step_q_w": 0.910495867768595, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 786.19921875, "completions/mean_terminated_length": 795.5217895507812, "completions/min_length": 0.0, "completions/min_terminated_length": 369.0, "epoch": 0.0224, "grad_norm": 0.014077394269406796, "kl": 0.0046427249908447266, "learning_rate": 4.9722222222222224e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018851812928915024, "mask/share_reasoning": 0.8657242059707642, "mask/share_step_conf": 0.10370523482561111, "num_tokens": 6554388.0, "reward": 0.6516726016998291, "reward_std": 0.41838204860687256, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.655136227607727, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.32086512446403503, "step": 21 }, { "adv/mean_abs_final_conf": 0.38115063309669495, "adv/mean_abs_reasoning": 0.37618327140808105, "adv/mean_abs_step_conf": 0.7508043050765991, "adv/ratio_final_to_reasoning": 1.0132046320667603, "adv/ratio_step_to_reasoning": 1.9958471365998933, "adv/std_final_conf": 0.6787269711494446, "adv/std_reasoning": 0.6815654039382935, "adv/std_step_conf": 0.935329794883728, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.34080645161290335, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34080645161290335, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9151821862348178, "calib/step_q_c_n": 1235.0, "calib/step_q_gap": -0.004241897534815742, "calib/step_q_w": 0.9194240837696336, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 779.8046875, "completions/mean_terminated_length": 795.3386840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.023466666666666667, "grad_norm": 0.15635307133197784, "kl": 0.022773265838623047, "learning_rate": 4.944444444444445e-06, "loss": -0.0687, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01855219155550003, "mask/share_reasoning": 0.8577111959457397, "mask/share_step_conf": 0.10420538485050201, "num_tokens": 6855834.0, "reward": 0.6378905773162842, "reward_std": 0.28502583503723145, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6356062293052673, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.32064372301101685, "step": 22 }, { "adv/mean_abs_final_conf": 0.47258201241493225, "adv/mean_abs_reasoning": 0.45605725049972534, "adv/mean_abs_step_conf": 0.7994668483734131, "adv/ratio_final_to_reasoning": 1.0362339638216471, "adv/ratio_step_to_reasoning": 1.7529966851692331, "adv/std_final_conf": 0.7200669646263123, "adv/std_reasoning": 0.720693826675415, "adv/std_step_conf": 0.9347937107086182, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.55078125, "calib/ece": 0.41799600000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.345794392379148e-06, "calib/mean_conf": 0.9899960000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899906542056074, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41799600000000015, "calib/std_conf": 0.0008508724933854668, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9142924086223055, "calib/step_q_c_n": 1067.0, "calib/step_q_gap": -0.0032410786756159604, "calib/step_q_w": 0.9175334872979215, "calib/step_q_w_n": 866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2410.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 800.8828125, "completions/mean_terminated_length": 816.836669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.024533333333333334, "grad_norm": 0.01701655611395836, "kl": 0.007472038269042969, "learning_rate": 4.9166666666666665e-06, "loss": -0.0472, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01825655996799469, "mask/share_reasoning": 0.8625786900520325, "mask/share_step_conf": 0.09963353723287582, "num_tokens": 7164796.0, "reward": 0.5432964563369751, "reward_std": 0.36795392632484436, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5667847394943237, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.2135581225156784, "step": 23 }, { "adv/mean_abs_final_conf": 0.5871478319168091, "adv/mean_abs_reasoning": 0.552639365196228, "adv/mean_abs_step_conf": 0.7762830257415771, "adv/ratio_final_to_reasoning": 1.0624430123763045, "adv/ratio_step_to_reasoning": 1.404682826866557, "adv/std_final_conf": 0.8089284300804138, "adv/std_reasoning": 0.792900562286377, "adv/std_step_conf": 0.9361394047737122, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5113929201139291, "calib/avg_num_step_conf": 7.4375, "calib/ece": 0.3977732793522267, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9878542510121457, "calib/gap": 0.0012647497626475657, "calib/mean_conf": 0.9888663967611336, "calib/mu_c": 0.9893835616438356, "calib/mu_w": 0.988118811881188, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3977732793522267, "calib/std_conf": 0.009874227503863084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.914830659536542, "calib/step_q_c_n": 1122.0, "calib/step_q_gap": 0.005738587925288696, "calib/step_q_w": 0.9090920716112533, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 843.9375, "completions/mean_terminated_length": 860.7490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.0256, "grad_norm": 0.030872756615281105, "kl": 0.007044792175292969, "learning_rate": 4.888888888888889e-06, "loss": -0.0169, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01767241582274437, "mask/share_reasoning": 0.8637890815734863, "mask/share_step_conf": 0.09900720417499542, "num_tokens": 7485356.0, "reward": 0.5760669708251953, "reward_std": 0.45210501551628113, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5794737935066223, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.26562875509262085, "step": 24 }, { "adv/mean_abs_final_conf": 0.348545640707016, "adv/mean_abs_reasoning": 0.3495147228240967, "adv/mean_abs_step_conf": 0.7539834380149841, "adv/ratio_final_to_reasoning": 0.9972273496542564, "adv/ratio_step_to_reasoning": 2.1572294063115844, "adv/std_final_conf": 0.6166334748268127, "adv/std_reasoning": 0.6185421347618103, "adv/std_step_conf": 0.9329771995544434, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.2578125, "calib/ece": 0.34856573705179283, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34856573705179283, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9151369863013697, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.0017456819535436319, "calib/step_q_w": 0.9133913043478261, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 785.921875, "completions/mean_terminated_length": 789.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 0.02666666666666667, "grad_norm": 0.02121826820075512, "kl": 0.007907867431640625, "learning_rate": 4.861111111111111e-06, "loss": 0.0238, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01904113218188286, "mask/share_reasoning": 0.876044750213623, "mask/share_step_conf": 0.10100783407688141, "num_tokens": 7789776.0, "reward": 0.6249831318855286, "reward_std": 0.2563919126987457, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6358394622802734, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2922517657279968, "step": 25 }, { "adv/mean_abs_final_conf": 0.4268883466720581, "adv/mean_abs_reasoning": 0.41854697465896606, "adv/mean_abs_step_conf": 0.7563217878341675, "adv/ratio_final_to_reasoning": 1.0199293568419379, "adv/ratio_step_to_reasoning": 1.8070176912649334, "adv/std_final_conf": 0.7015519142150879, "adv/std_reasoning": 0.7013825178146362, "adv/std_step_conf": 0.9330384135246277, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.1171875, "calib/ece": 0.34317460317460324, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34317460317460324, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9113718723037103, "calib/step_q_c_n": 1159.0, "calib/step_q_gap": -0.0010564836540575273, "calib/step_q_w": 0.9124283559577678, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 754.95703125, "completions/mean_terminated_length": 760.9015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.027733333333333332, "grad_norm": 0.01634961925446987, "kl": 0.010264396667480469, "learning_rate": 4.833333333333333e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019156184047460556, "mask/share_reasoning": 0.8739191293716431, "mask/share_step_conf": 0.0991121456027031, "num_tokens": 8088285.0, "reward": 0.667913019657135, "reward_std": 0.2968505620956421, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6435734033584595, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.36803382635116577, "step": 26 }, { "adv/mean_abs_final_conf": 0.5693494081497192, "adv/mean_abs_reasoning": 0.5429830551147461, "adv/mean_abs_step_conf": 0.7924162745475769, "adv/ratio_final_to_reasoning": 1.048558334899422, "adv/ratio_step_to_reasoning": 1.4593756970558118, "adv/std_final_conf": 0.7916540503501892, "adv/std_reasoning": 0.7754848599433899, "adv/std_step_conf": 0.9357017278671265, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49655720338983045, "calib/avg_num_step_conf": 7.5390625, "calib/ece": 0.4689837398373984, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": -0.0010023834745759297, "calib/mean_conf": 0.989308943089431, "calib/mu_c": 0.9888281250000002, "calib/mu_w": 0.9898305084745761, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4689837398373984, "calib/std_conf": 0.006370192376644552, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9138975966562173, "calib/step_q_c_n": 957.0, "calib/step_q_gap": -0.001621416704522649, "calib/step_q_w": 0.9155190133607399, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 775.19140625, "completions/mean_terminated_length": 796.98388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.0288, "grad_norm": 0.03222472965717316, "kl": 0.0107879638671875, "learning_rate": 4.805555555555556e-06, "loss": -0.0881, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01808036118745804, "mask/share_reasoning": 0.8571000099182129, "mask/share_step_conf": 0.09747587144374847, "num_tokens": 8391950.0, "reward": 0.48863452672958374, "reward_std": 0.4100489318370819, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5092262029647827, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.17585542798042297, "step": 27 }, { "adv/mean_abs_final_conf": 0.4156338572502136, "adv/mean_abs_reasoning": 0.38927507400512695, "adv/mean_abs_step_conf": 0.7773406505584717, "adv/ratio_final_to_reasoning": 1.0677124866328829, "adv/ratio_step_to_reasoning": 1.9968929491443208, "adv/std_final_conf": 0.6787872910499573, "adv/std_reasoning": 0.6614362001419067, "adv/std_step_conf": 0.9353095293045044, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5175358970127084, "calib/avg_num_step_conf": 6.5, "calib/ece": 0.2940585774058576, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9874476987447699, "calib/gap": 0.00274550255817807, "calib/mean_conf": 0.9886192468619246, "calib/mu_c": 0.9894578313253012, "calib/mu_w": 0.9867123287671231, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2940585774058576, "calib/std_conf": 0.012177960658197895, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9127568493150684, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.007131849315068273, "calib/step_q_w": 0.9056250000000001, "calib/step_q_w_n": 496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 780.625, "completions/mean_terminated_length": 793.0159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.029866666666666666, "grad_norm": 0.030569037422537804, "kl": 0.013233184814453125, "learning_rate": 4.777777777777778e-06, "loss": -0.0525, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018613245338201523, "mask/share_reasoning": 0.8756033182144165, "mask/share_step_conf": 0.09015839546918869, "num_tokens": 8698734.0, "reward": 0.661705732345581, "reward_std": 0.299774169921875, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6507663726806641, "rewards/format_reward_step": 0.92578125, "rewards/step_correlation_reward": 0.35780128836631775, "step": 28 }, { "adv/mean_abs_final_conf": 0.4315052628517151, "adv/mean_abs_reasoning": 0.39599186182022095, "adv/mean_abs_step_conf": 0.7412615418434143, "adv/ratio_final_to_reasoning": 1.0896821486892503, "adv/ratio_step_to_reasoning": 1.8719110499799734, "adv/std_final_conf": 0.7103040218353271, "adv/std_reasoning": 0.70142662525177, "adv/std_step_conf": 0.9351839423179626, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4931972789115646, "calib/avg_num_step_conf": 6.7265625, "calib/ece": 0.38733606557377054, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003401360544218468, "calib/mean_conf": 0.9897950819672131, "calib/mu_c": 0.9896598639455781, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38733606557377054, "calib/std_conf": 0.002299103619304223, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9086032977691563, "calib/step_q_c_n": 1031.0, "calib/step_q_gap": 0.00427623554050216, "calib/step_q_w": 0.9043270622286541, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 849.328125, "completions/mean_terminated_length": 869.7120361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.030933333333333334, "grad_norm": 0.016793156042695045, "kl": 0.011755943298339844, "learning_rate": 4.75e-06, "loss": -0.0486, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.0170335303992033, "mask/share_reasoning": 0.8732938766479492, "mask/share_step_conf": 0.0862351506948471, "num_tokens": 9023290.0, "reward": 0.5508725643157959, "reward_std": 0.2956800162792206, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5816925764083862, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.21458379924297333, "step": 29 }, { "adv/mean_abs_final_conf": 0.5231447219848633, "adv/mean_abs_reasoning": 0.4696976840496063, "adv/mean_abs_step_conf": 0.7574329376220703, "adv/ratio_final_to_reasoning": 1.1137902948007983, "adv/ratio_step_to_reasoning": 1.6125967049521908, "adv/std_final_conf": 0.778552234172821, "adv/std_reasoning": 0.7393490076065063, "adv/std_step_conf": 0.9359943866729736, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5002841716396703, "calib/avg_num_step_conf": 7.2265625, "calib/ece": 0.3659959183673469, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": -0.000948920147769039, "calib/mean_conf": 0.9888530612244898, "calib/mu_c": 0.9884967320261439, "calib/mu_w": 0.9894456521739129, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.36517959183673465, "calib/std_conf": 0.01233192475568696, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9107060900264783, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": 0.004667038422572989, "calib/step_q_w": 0.9060390516039053, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 824.5390625, "completions/mean_terminated_length": 837.6270141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.032, "grad_norm": 0.030233023688197136, "kl": 0.01520538330078125, "learning_rate": 4.722222222222222e-06, "loss": -0.0384, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017376385629177094, "mask/share_reasoning": 0.8753885626792908, "mask/share_step_conf": 0.09161008894443512, "num_tokens": 9341356.0, "reward": 0.6088252067565918, "reward_std": 0.36066246032714844, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6049777269363403, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.3017350435256958, "step": 30 }, { "adv/mean_abs_final_conf": 0.5415816307067871, "adv/mean_abs_reasoning": 0.44430142641067505, "adv/mean_abs_step_conf": 0.7793623208999634, "adv/ratio_final_to_reasoning": 1.2189509160076257, "adv/ratio_step_to_reasoning": 1.7541296844264147, "adv/std_final_conf": 0.7959402203559875, "adv/std_reasoning": 0.7207141518592834, "adv/std_step_conf": 0.935605525970459, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5253833950290852, "calib/avg_num_step_conf": 7.27734375, "calib/ece": 0.4842276422764228, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": 0.0018164992067687402, "calib/mean_conf": 0.9882926829268293, "calib/mu_c": 0.9891935483870967, "calib/mu_w": 0.987377049180328, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4842276422764228, "calib/std_conf": 0.008853580147024696, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9126297968397293, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.019088343410865405, "calib/step_q_w": 0.8935414534288639, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 894.51171875, "completions/mean_terminated_length": 912.3306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.03306666666666667, "grad_norm": 0.02490750513970852, "kl": 0.015928268432617188, "learning_rate": 4.694444444444445e-06, "loss": -0.0297, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01670205220580101, "mask/share_reasoning": 0.8775709867477417, "mask/share_step_conf": 0.08619573712348938, "num_tokens": 9676263.0, "reward": 0.450461208820343, "reward_std": 0.3570416271686554, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.49612146615982056, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.11651960015296936, "step": 31 }, { "adv/mean_abs_final_conf": 0.5400800108909607, "adv/mean_abs_reasoning": 0.45109522342681885, "adv/mean_abs_step_conf": 0.7773272395133972, "adv/ratio_final_to_reasoning": 1.1972638654609427, "adv/ratio_step_to_reasoning": 1.7231998902766101, "adv/std_final_conf": 0.784846842288971, "adv/std_reasoning": 0.7394152879714966, "adv/std_step_conf": 0.9346668124198914, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5321026197909596, "calib/avg_num_step_conf": 6.65234375, "calib/ece": 0.4194693877551021, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9877551020408163, "calib/gap": 0.003201438848920768, "calib/mean_conf": 0.9868163265306124, "calib/mu_c": 0.9882014388489208, "calib/mu_w": 0.985, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4194693877551021, "calib/std_conf": 0.011835045621921766, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9128676470588235, "calib/step_q_c_n": 952.0, "calib/step_q_gap": 0.006036754915015097, "calib/step_q_w": 0.9068308921438084, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 804.30078125, "completions/mean_terminated_length": 830.2459716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.034133333333333335, "grad_norm": 0.046489838510751724, "kl": 0.021024703979492188, "learning_rate": 4.666666666666667e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017999248579144478, "mask/share_reasoning": 0.8620076179504395, "mask/share_step_conf": 0.08874315023422241, "num_tokens": 9988868.0, "reward": 0.533851146697998, "reward_std": 0.3655565083026886, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5549355745315552, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.213547945022583, "step": 32 }, { "adv/mean_abs_final_conf": 0.5530176162719727, "adv/mean_abs_reasoning": 0.4229363203048706, "adv/mean_abs_step_conf": 0.754909098148346, "adv/ratio_final_to_reasoning": 1.3075670963263073, "adv/ratio_step_to_reasoning": 1.7849237861722898, "adv/std_final_conf": 0.7939488887786865, "adv/std_reasoning": 0.7013248205184937, "adv/std_step_conf": 0.9332074522972107, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5538732394366197, "calib/avg_num_step_conf": 6.875, "calib/ece": 0.4250396825396827, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.001911651728553232, "calib/mean_conf": 0.9885317460317462, "calib/mu_c": 0.9893661971830987, "calib/mu_w": 0.9874545454545455, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4250396825396827, "calib/std_conf": 0.00659387222500005, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9126046986721145, "calib/step_q_c_n": 979.0, "calib/step_q_gap": 0.006138629529988893, "calib/step_q_w": 0.9064660691421256, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 748.5625, "completions/mean_terminated_length": 760.4444580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0352, "grad_norm": 0.09581245481967926, "kl": 0.022441864013671875, "learning_rate": 4.638888888888889e-06, "loss": -0.0162, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019542766734957695, "mask/share_reasoning": 0.8670825958251953, "mask/share_step_conf": 0.09774963557720184, "num_tokens": 10287372.0, "reward": 0.5962920188903809, "reward_std": 0.3191433548927307, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5652187466621399, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3203340768814087, "step": 33 }, { "adv/mean_abs_final_conf": 0.6945086121559143, "adv/mean_abs_reasoning": 0.6001378297805786, "adv/mean_abs_step_conf": 0.7928270101547241, "adv/ratio_final_to_reasoning": 1.1572485147450868, "adv/ratio_step_to_reasoning": 1.321074877823643, "adv/std_final_conf": 0.8546317219734192, "adv/std_reasoning": 0.8099603652954102, "adv/std_step_conf": 0.9364176988601685, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.481389252948886, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.4232530120481929, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9839357429718876, "calib/gap": -4.5871559635202175e-06, "calib/mean_conf": 0.9855020080321286, "calib/mu_c": 0.9854999999999998, "calib/mu_w": 0.9855045871559633, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4232530120481929, "calib/std_conf": 0.011883343443218902, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9124770642201836, "calib/step_q_c_n": 981.0, "calib/step_q_gap": 0.0011754371863912372, "calib/step_q_w": 0.9113016270337924, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 747.47265625, "completions/mean_terminated_length": 753.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.03626666666666667, "grad_norm": 0.03602541610598564, "kl": 0.029666900634765625, "learning_rate": 4.611111111111112e-06, "loss": -0.0109, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01917748898267746, "mask/share_reasoning": 0.8739994764328003, "mask/share_step_conf": 0.09901049733161926, "num_tokens": 10583837.0, "reward": 0.5894248485565186, "reward_std": 0.45810455083847046, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5588769316673279, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.31606653332710266, "step": 34 }, { "adv/mean_abs_final_conf": 0.5951634645462036, "adv/mean_abs_reasoning": 0.46525806188583374, "adv/mean_abs_step_conf": 0.7743443846702576, "adv/ratio_final_to_reasoning": 1.279211502824526, "adv/ratio_step_to_reasoning": 1.6643330833034942, "adv/std_final_conf": 0.8165079951286316, "adv/std_reasoning": 0.7393738031387329, "adv/std_step_conf": 0.9352921843528748, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5283114943309117, "calib/avg_num_step_conf": 7.22265625, "calib/ece": 0.3974796747967483, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.967479674796748, "calib/gap": 0.0045827958449314865, "calib/mean_conf": 0.9787804878048784, "calib/mu_c": 0.9806993006993008, "calib/mu_w": 0.9761165048543693, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3974796747967483, "calib/std_conf": 0.01734796283130076, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9066231155778894, "calib/step_q_c_n": 995.0, "calib/step_q_gap": -6.306709189984883e-05, "calib/step_q_w": 0.9066861826697893, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 812.34375, "completions/mean_terminated_length": 838.54833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 473.0, "epoch": 0.037333333333333336, "grad_norm": 0.072185218334198, "kl": 0.034183502197265625, "learning_rate": 4.583333333333333e-06, "loss": -0.1212, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017634199932217598, "mask/share_reasoning": 0.8655506372451782, "mask/share_step_conf": 0.08556517958641052, "num_tokens": 10901053.0, "reward": 0.5513752698898315, "reward_std": 0.343622088432312, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5770906209945679, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.22175362706184387, "step": 35 }, { "adv/mean_abs_final_conf": 0.6594531536102295, "adv/mean_abs_reasoning": 0.3306323289871216, "adv/mean_abs_step_conf": 0.7618266344070435, "adv/ratio_final_to_reasoning": 1.9945210912388298, "adv/ratio_step_to_reasoning": 2.3041504644777713, "adv/std_final_conf": 0.8654852509498596, "adv/std_reasoning": 0.6185620427131653, "adv/std_step_conf": 0.9333191514015198, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5999921235034656, "calib/avg_num_step_conf": 7.75, "calib/ece": 0.2398023715415022, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8300395256916996, "calib/gap": 0.007934782608695734, "calib/mean_conf": 0.9670750988142295, "calib/mu_c": 0.9692391304347828, "calib/mu_w": 0.9613043478260871, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2398023715415022, "calib/std_conf": 0.03087879999322926, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.91320987654321, "calib/step_q_c_n": 1458.0, "calib/step_q_gap": 0.008457024832183246, "calib/step_q_w": 0.9047528517110267, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 791.72265625, "completions/mean_terminated_length": 791.72265625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.0384, "grad_norm": 0.11893506348133087, "kl": 0.048084259033203125, "learning_rate": 4.555555555555556e-06, "loss": 0.0514, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019558222964406013, "mask/share_reasoning": 0.8685697317123413, "mask/share_step_conf": 0.11187203228473663, "num_tokens": 11206446.0, "reward": 0.7723823189735413, "reward_std": 0.2554051876068115, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7375956773757935, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.4657626152038574, "step": 36 }, { "adv/mean_abs_final_conf": 0.5802910327911377, "adv/mean_abs_reasoning": 0.41940683126449585, "adv/mean_abs_step_conf": 0.7726558446884155, "adv/ratio_final_to_reasoning": 1.3835993825889341, "adv/ratio_step_to_reasoning": 1.8422586068970006, "adv/std_final_conf": 0.8165745735168457, "adv/std_reasoning": 0.701346755027771, "adv/std_step_conf": 0.9323224425315857, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5297814207650273, "calib/avg_num_step_conf": 7.4453125, "calib/ece": 0.4716942148760333, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8677685950413223, "calib/gap": 0.0015300546448088204, "calib/mean_conf": 0.9675619834710747, "calib/mu_c": 0.9683333333333337, "calib/mu_w": 0.9668032786885249, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4716942148760333, "calib/std_conf": 0.030223791340730083, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.904900442477876, "calib/step_q_c_n": 904.0, "calib/step_q_gap": 0.011297648066698307, "calib/step_q_w": 0.8936027944111777, "calib/step_q_w_n": 1002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 782.68359375, "completions/mean_terminated_length": 814.5, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.039466666666666664, "grad_norm": 0.04534284025430679, "kl": 0.048694610595703125, "learning_rate": 4.527777777777778e-06, "loss": -0.0298, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018106989562511444, "mask/share_reasoning": 0.8466689586639404, "mask/share_step_conf": 0.09616149961948395, "num_tokens": 11513909.0, "reward": 0.44464361667633057, "reward_std": 0.29512786865234375, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.49853241443634033, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.10794225335121155, "step": 37 }, { "adv/mean_abs_final_conf": 0.6019595861434937, "adv/mean_abs_reasoning": 0.4334124028682709, "adv/mean_abs_step_conf": 0.7732325792312622, "adv/ratio_final_to_reasoning": 1.3888840793659754, "adv/ratio_step_to_reasoning": 1.784057341493004, "adv/std_final_conf": 0.8388135433197021, "adv/std_reasoning": 0.7206992506980896, "adv/std_step_conf": 0.9356791377067566, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5753998742753371, "calib/avg_num_step_conf": 7.16796875, "calib/ece": 0.39785123966942176, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8966942148760331, "calib/gap": 0.0067542082838580875, "calib/mean_conf": 0.9705785123966945, "calib/mu_c": 0.9734532374100722, "calib/mu_w": 0.9666990291262141, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39702479338843005, "calib/std_conf": 0.02669997123411818, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9112463768115943, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.009346376811594337, "calib/step_q_w": 0.9018999999999999, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 797.734375, "completions/mean_terminated_length": 816.8800659179688, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.04053333333333333, "grad_norm": 0.037656933069229126, "kl": 0.05291748046875, "learning_rate": 4.5e-06, "loss": 0.0564, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01798248291015625, "mask/share_reasoning": 0.8641847968101501, "mask/share_step_conf": 0.09439519047737122, "num_tokens": 11825017.0, "reward": 0.5232806205749512, "reward_std": 0.3373889923095703, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5682734251022339, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.1806315928697586, "step": 38 }, { "adv/mean_abs_final_conf": 0.662196695804596, "adv/mean_abs_reasoning": 0.4865483045578003, "adv/mean_abs_step_conf": 0.7729369401931763, "adv/ratio_final_to_reasoning": 1.361009152845438, "adv/ratio_step_to_reasoning": 1.5886129556975035, "adv/std_final_conf": 0.8549278378486633, "adv/std_reasoning": 0.7576348185539246, "adv/std_step_conf": 0.9355428218841553, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5317412736477485, "calib/avg_num_step_conf": 7.30859375, "calib/ece": 0.4031578947368425, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9068825910931174, "calib/gap": 0.012642552624566816, "calib/mean_conf": 0.9659109311740894, "calib/mu_c": 0.9714388489208635, "calib/mu_w": 0.9587962962962967, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4031578947368425, "calib/std_conf": 0.06733113652157344, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9081381957773512, "calib/step_q_c_n": 1042.0, "calib/step_q_gap": 0.006678605910041102, "calib/step_q_w": 0.9014595898673101, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2840.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 804.484375, "completions/mean_terminated_length": 820.510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.0416, "grad_norm": 0.29108595848083496, "kl": 0.087554931640625, "learning_rate": 4.472222222222223e-06, "loss": -0.0154, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01814432442188263, "mask/share_reasoning": 0.8632323741912842, "mask/share_step_conf": 0.09909210354089737, "num_tokens": 12137053.0, "reward": 0.5462459325790405, "reward_std": 0.3595222234725952, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5637624859809875, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.22951051592826843, "step": 39 }, { "adv/mean_abs_final_conf": 0.6811327338218689, "adv/mean_abs_reasoning": 0.4848672151565552, "adv/mean_abs_step_conf": 0.7687471508979797, "adv/ratio_final_to_reasoning": 1.4047819950085572, "adv/ratio_step_to_reasoning": 1.5854797496460236, "adv/std_final_conf": 0.8914880156517029, "adv/std_reasoning": 0.7576228380203247, "adv/std_step_conf": 0.9358471632003784, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5200105666358473, "calib/avg_num_step_conf": 7.671875, "calib/ece": 0.4160323886639678, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8178137651821862, "calib/gap": -0.0008922203143574281, "calib/mean_conf": 0.9585425101214577, "calib/mu_c": 0.9581343283582091, "calib/mu_w": 0.9590265486725665, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4160323886639678, "calib/std_conf": 0.03918580795538473, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9042110762800419, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.013357054432971416, "calib/step_q_w": 0.8908540218470705, "calib/step_q_w_n": 1007.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 827.82421875, "completions/mean_terminated_length": 844.3147583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.042666666666666665, "grad_norm": 0.028931770473718643, "kl": 0.07387542724609375, "learning_rate": 4.444444444444444e-06, "loss": 0.0035, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017673280090093613, "mask/share_reasoning": 0.8669548630714417, "mask/share_step_conf": 0.09584060311317444, "num_tokens": 12455736.0, "reward": 0.49367809295654297, "reward_std": 0.3905242681503296, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5518601536750793, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.13940228521823883, "step": 40 }, { "adv/mean_abs_final_conf": 0.6893098950386047, "adv/mean_abs_reasoning": 0.46814632415771484, "adv/mean_abs_step_conf": 0.7652798891067505, "adv/ratio_final_to_reasoning": 1.4724240252848415, "adv/ratio_step_to_reasoning": 1.6347023347532121, "adv/std_final_conf": 0.8758644461631775, "adv/std_reasoning": 0.739457368850708, "adv/std_step_conf": 0.9358088374137878, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5544559809647416, "calib/avg_num_step_conf": 8.25, "calib/ece": 0.14963562753036444, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8704453441295547, "calib/gap": 0.01726908933592952, "calib/mean_conf": 0.9634008097165994, "calib/mu_c": 0.9666169154228856, "calib/mu_w": 0.9493478260869561, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14963562753036444, "calib/std_conf": 0.03984500943245332, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8970809061488674, "calib/step_q_c_n": 1545.0, "calib/step_q_gap": -0.031472885738257905, "calib/step_q_w": 0.9285537918871253, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 774.76953125, "completions/mean_terminated_length": 783.95654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.04373333333333333, "grad_norm": 0.03703810274600983, "kl": 0.09344482421875, "learning_rate": 4.416666666666667e-06, "loss": -0.0483, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019327331334352493, "mask/share_reasoning": 0.8618630170822144, "mask/share_step_conf": 0.10709092766046524, "num_tokens": 12761325.0, "reward": 0.8326570987701416, "reward_std": 0.36482173204421997, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.7927339673042297, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.5241426825523376, "step": 41 }, { "adv/mean_abs_final_conf": 0.7318501472473145, "adv/mean_abs_reasoning": 0.39691048860549927, "adv/mean_abs_step_conf": 0.7763651609420776, "adv/ratio_final_to_reasoning": 1.843866988293981, "adv/ratio_step_to_reasoning": 1.9560207735244035, "adv/std_final_conf": 0.9062535762786865, "adv/std_reasoning": 0.7015131115913391, "adv/std_step_conf": 0.9363811016082764, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.494484412470024, "calib/avg_num_step_conf": 7.34375, "calib/ece": 0.39168032786885265, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.8647540983606558, "calib/gap": 0.009730044535799953, "calib/mean_conf": 0.9613524590163937, "calib/mu_c": 0.9655395683453238, "calib/mu_w": 0.9558095238095239, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.39168032786885265, "calib/std_conf": 0.04527030332133524, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8892323030907279, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": -0.012842953465714735, "calib/step_q_w": 0.9020752565564426, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 725.21875, "completions/mean_terminated_length": 736.730224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.0448, "grad_norm": 0.026927631348371506, "kl": 0.126861572265625, "learning_rate": 4.388888888888889e-06, "loss": -0.0247, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019635125994682312, "mask/share_reasoning": 0.8590686321258545, "mask/share_step_conf": 0.10567127168178558, "num_tokens": 13051349.0, "reward": 0.5205029845237732, "reward_std": 0.3840179443359375, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.556390643119812, "rewards/format_reward_step": 0.9296875, "rewards/step_correlation_reward": 0.1900840848684311, "step": 42 }, { "adv/mean_abs_final_conf": 0.7539446353912354, "adv/mean_abs_reasoning": 0.48671162128448486, "adv/mean_abs_step_conf": 0.7926192879676819, "adv/ratio_final_to_reasoning": 1.5490582152147785, "adv/ratio_step_to_reasoning": 1.6285193393900754, "adv/std_final_conf": 0.9174970984458923, "adv/std_reasoning": 0.7395156025886536, "adv/std_step_conf": 0.9361217021942139, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5945800706279747, "calib/avg_num_step_conf": 7.27734375, "calib/ece": 0.27718367346938777, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8326530612244898, "calib/gap": 0.012746046368800723, "calib/mean_conf": 0.9588163265306123, "calib/mu_c": 0.962874251497006, "calib/mu_w": 0.9501282051282053, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27718367346938777, "calib/std_conf": 0.03810601402499308, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8881299524564185, "calib/step_q_c_n": 1262.0, "calib/step_q_gap": -0.0035505799895049206, "calib/step_q_w": 0.8916805324459234, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 798.6171875, "completions/mean_terminated_length": 801.7490844726562, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.04586666666666667, "grad_norm": 0.04725484177470207, "kl": 0.1308135986328125, "learning_rate": 4.361111111111112e-06, "loss": 0.0007, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01898164488375187, "mask/share_reasoning": 0.8755961060523987, "mask/share_step_conf": 0.10151597857475281, "num_tokens": 13361019.0, "reward": 0.6691012382507324, "reward_std": 0.41626691818237305, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6680905818939209, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.34979933500289917, "step": 43 }, { "adv/mean_abs_final_conf": 0.7528752088546753, "adv/mean_abs_reasoning": 0.5847516059875488, "adv/mean_abs_step_conf": 0.7902089953422546, "adv/ratio_final_to_reasoning": 1.2875128535700102, "adv/ratio_step_to_reasoning": 1.3513584011585948, "adv/std_final_conf": 0.9186169505119324, "adv/std_reasoning": 0.8267921209335327, "adv/std_step_conf": 0.9367049932479858, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.535816108339273, "calib/avg_num_step_conf": 6.92578125, "calib/ece": 0.47810126582278495, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.8649789029535865, "calib/gap": 0.008359230220955194, "calib/mean_conf": 0.9599578059071732, "calib/mu_c": 0.9642608695652175, "calib/mu_w": 0.9559016393442623, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.4764135021097048, "calib/std_conf": 0.04752523549324154, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8809450830140485, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.006056194125159564, "calib/step_q_w": 0.8748888888888889, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 798.7109375, "completions/mean_terminated_length": 824.4757690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.046933333333333334, "grad_norm": 0.03326358273625374, "kl": 0.1516876220703125, "learning_rate": 4.333333333333334e-06, "loss": -0.1195, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01732334867119789, "mask/share_reasoning": 0.8651239275932312, "mask/share_step_conf": 0.0863027423620224, "num_tokens": 13671809.0, "reward": 0.40459272265434265, "reward_std": 0.41235947608947754, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.4757663905620575, "rewards/format_reward_step": 0.90625, "rewards/step_correlation_reward": 0.06232527643442154, "step": 44 }, { "adv/mean_abs_final_conf": 0.7470870018005371, "adv/mean_abs_reasoning": 0.5880221128463745, "adv/mean_abs_step_conf": 0.8088736534118652, "adv/ratio_final_to_reasoning": 1.2705083456542043, "adv/ratio_step_to_reasoning": 1.3755837335715808, "adv/std_final_conf": 0.905470609664917, "adv/std_reasoning": 0.8432220220565796, "adv/std_step_conf": 0.9366260170936584, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.559177646524307, "calib/avg_num_step_conf": 8.69140625, "calib/ece": 0.3577021276595747, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.87109375, "calib/frac_conf_gt_0.9": 0.8851063829787233, "calib/gap": 0.013560502801757024, "calib/mean_conf": 0.9619574468085108, "calib/mu_c": 0.9673239436619719, "calib/mu_w": 0.9537634408602149, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.3577021276595747, "calib/std_conf": 0.05073500509921511, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.8776923076923078, "calib/step_q_c_n": 1040.0, "calib/step_q_gap": -0.033767607919506504, "calib/step_q_w": 0.9114599156118143, "calib/step_q_w_n": 1185.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 749.8828125, "completions/mean_terminated_length": 783.5509643554688, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.048, "grad_norm": 0.028959548100829124, "kl": 0.16290283203125, "learning_rate": 4.305555555555556e-06, "loss": -0.0939, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.018562179058790207, "mask/share_reasoning": 0.8374760150909424, "mask/share_step_conf": 0.10099305957555771, "num_tokens": 13968827.0, "reward": 0.502269446849823, "reward_std": 0.403462290763855, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5527081489562988, "rewards/format_reward_step": 0.87109375, "rewards/step_correlation_reward": 0.16667440533638, "step": 45 }, { "adv/mean_abs_final_conf": 0.734359085559845, "adv/mean_abs_reasoning": 0.606724739074707, "adv/mean_abs_step_conf": 0.8087029457092285, "adv/ratio_final_to_reasoning": 1.2103661483785684, "adv/ratio_step_to_reasoning": 1.3328992434733267, "adv/std_final_conf": 0.9238776564598083, "adv/std_reasoning": 0.8433163166046143, "adv/std_step_conf": 0.9367368817329407, "calib/answer_extract_rate": 0.8671875, "calib/auroc": 0.5771551367881643, "calib/avg_num_step_conf": 7.45703125, "calib/ece": 0.45181818181818206, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.81640625, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.013158112240681086, "calib/mean_conf": 0.9563636363636365, "calib/mu_c": 0.9628828828828832, "calib/mu_w": 0.9497247706422021, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.45181818181818206, "calib/std_conf": 0.04823667546073352, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.8656472081218276, "calib/step_q_c_n": 788.0, "calib/step_q_gap": -0.005030757979867295, "calib/step_q_w": 0.8706779661016949, "calib/step_q_w_n": 1121.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 844.49609375, "completions/mean_terminated_length": 889.6748657226562, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.04906666666666667, "grad_norm": 0.020218025892972946, "kl": 0.149078369140625, "learning_rate": 4.277777777777778e-06, "loss": -0.101, "mask/has_final_conf_rate": 0.859375, "mask/share_final_conf": 0.01695256121456623, "mask/share_reasoning": 0.8441320657730103, "mask/share_step_conf": 0.08813411742448807, "num_tokens": 14289786.0, "reward": 0.39327341318130493, "reward_std": 0.393865168094635, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.45081520080566406, "rewards/format_reward_step": 0.81640625, "rewards/step_correlation_reward": 0.08573156595230103, "step": 46 }, { "adv/mean_abs_final_conf": 0.7517024278640747, "adv/mean_abs_reasoning": 0.629054844379425, "adv/mean_abs_step_conf": 0.7849420309066772, "adv/ratio_final_to_reasoning": 1.1949712089184272, "adv/ratio_step_to_reasoning": 1.2478117574645466, "adv/std_final_conf": 0.919205367565155, "adv/std_reasoning": 0.8432762026786804, "adv/std_step_conf": 0.9367557168006897, "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5730214621059692, "calib/avg_num_step_conf": 6.74609375, "calib/ece": 0.3327433628318586, "calib/final_conf_rate": 0.8828125, "calib/format_rate": 0.83984375, "calib/frac_conf_gt_0.9": 0.911504424778761, "calib/gap": 0.030679074446680166, "calib/mean_conf": 0.9610619469026551, "calib/mu_c": 0.9724647887323944, "calib/mu_w": 0.9417857142857142, "calib/nonempty_final_conf_rate": 0.8828125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.3327433628318586, "calib/std_conf": 0.07747523534513975, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.8705286343612335, "calib/step_q_c_n": 908.0, "calib/step_q_gap": -0.0069072630746639385, "calib/step_q_w": 0.8774358974358974, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 800.52734375, "completions/mean_terminated_length": 823.0321044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.050133333333333335, "grad_norm": 0.021257705986499786, "kl": 0.1793212890625, "learning_rate": 4.25e-06, "loss": -0.0244, "mask/has_final_conf_rate": 0.8828125, "mask/share_final_conf": 0.017377939075231552, "mask/share_reasoning": 0.8701250553131104, "mask/share_step_conf": 0.0851532369852066, "num_tokens": 14600697.0, "reward": 0.5386103987693787, "reward_std": 0.39626145362854004, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5514390468597412, "rewards/format_reward_step": 0.83984375, "rewards/step_correlation_reward": 0.24687549471855164, "step": 47 }, { "adv/mean_abs_final_conf": 0.7742482423782349, "adv/mean_abs_reasoning": 0.6331319808959961, "adv/mean_abs_step_conf": 0.7943623661994934, "adv/ratio_final_to_reasoning": 1.222886010721704, "adv/ratio_step_to_reasoning": 1.2546552538308477, "adv/std_final_conf": 0.9287095665931702, "adv/std_reasoning": 0.8592767715454102, "adv/std_step_conf": 0.9367825388908386, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.5647463768115943, "calib/avg_num_step_conf": 6.9296875, "calib/ece": 0.47548936170212774, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.84765625, "calib/frac_conf_gt_0.9": 0.8893617021276595, "calib/gap": 0.010768115942029088, "calib/mean_conf": 0.952936170212766, "calib/mu_c": 0.9584347826086957, "calib/mu_w": 0.9476666666666667, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.4695319148936171, "calib/std_conf": 0.10158476269039882, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.879713216957606, "calib/step_q_c_n": 802.0, "calib/step_q_gap": -0.00426826452387552, "calib/step_q_w": 0.8839814814814815, "calib/step_q_w_n": 972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 754.78515625, "completions/mean_terminated_length": 776.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.0512, "grad_norm": 0.023023847490549088, "kl": 0.19683837890625, "learning_rate": 4.222222222222223e-06, "loss": -0.1132, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01880646124482155, "mask/share_reasoning": 0.8589140176773071, "mask/share_step_conf": 0.09493573009967804, "num_tokens": 14897610.0, "reward": 0.4060549736022949, "reward_std": 0.4451637268066406, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.4412851333618164, "rewards/format_reward_step": 0.84765625, "rewards/step_correlation_reward": 0.11144982278347015, "step": 48 }, { "adv/mean_abs_final_conf": 0.7295511960983276, "adv/mean_abs_reasoning": 0.5492097735404968, "adv/mean_abs_step_conf": 0.7696366906166077, "adv/ratio_final_to_reasoning": 1.3283652827138428, "adv/ratio_step_to_reasoning": 1.4013528667837105, "adv/std_final_conf": 0.9244924783706665, "adv/std_reasoning": 0.8101789951324463, "adv/std_step_conf": 0.9364129304885864, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5738241792929293, "calib/avg_num_step_conf": 7.296875, "calib/ece": 0.33586206896551735, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.8663793103448276, "calib/gap": 0.03034090909090914, "calib/mean_conf": 0.9522413793103449, "calib/mu_c": 0.9637500000000002, "calib/mu_w": 0.9334090909090911, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.33370689655172425, "calib/std_conf": 0.10317449403213547, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.8774825870646766, "calib/step_q_c_n": 1005.0, "calib/step_q_gap": 0.00204805635784, "calib/step_q_w": 0.8754345307068366, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 752.1796875, "completions/mean_terminated_length": 773.3252563476562, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.05226666666666667, "grad_norm": 0.027003340423107147, "kl": 0.1777801513671875, "learning_rate": 4.194444444444445e-06, "loss": -0.0606, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.018772877752780914, "mask/share_reasoning": 0.8578805923461914, "mask/share_step_conf": 0.09600280225276947, "num_tokens": 15194704.0, "reward": 0.5183796882629395, "reward_std": 0.36601462960243225, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5617023706436157, "rewards/format_reward_step": 0.8515625, "rewards/step_correlation_reward": 0.19146324694156647, "step": 49 }, { "adv/mean_abs_final_conf": 0.7670071721076965, "adv/mean_abs_reasoning": 0.6311091184616089, "adv/mean_abs_step_conf": 0.7808622717857361, "adv/ratio_final_to_reasoning": 1.2153321029132849, "adv/ratio_step_to_reasoning": 1.2372856752397514, "adv/std_final_conf": 0.9278801679611206, "adv/std_reasoning": 0.8593429327011108, "adv/std_step_conf": 0.9367603063583374, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5901970284237726, "calib/avg_num_step_conf": 6.640625, "calib/ece": 0.35547826086956513, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.84375, "calib/frac_conf_gt_0.9": 0.8826086956521739, "calib/gap": -0.006333979328165262, "calib/mean_conf": 0.9541739130434784, "calib/mu_c": 0.9518055555555556, "calib/mu_w": 0.9581395348837208, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.34178260869565213, "calib/std_conf": 0.11421933597589791, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.8708485499462945, "calib/step_q_c_n": 931.0, "calib/step_q_gap": -0.004392022225357062, "calib/step_q_w": 0.8752405721716515, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 777.17578125, "completions/mean_terminated_length": 795.8280639648438, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.05333333333333334, "grad_norm": 0.021602502092719078, "kl": 0.1998291015625, "learning_rate": 4.166666666666667e-06, "loss": -0.1236, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.018588896840810776, "mask/share_reasoning": 0.8642673492431641, "mask/share_step_conf": 0.09370625764131546, "num_tokens": 15499021.0, "reward": 0.5040627717971802, "reward_std": 0.42864006757736206, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5358277559280396, "rewards/format_reward_step": 0.84375, "rewards/step_correlation_reward": 0.19104784727096558, "step": 50 }, { "adv/mean_abs_final_conf": 0.7589243650436401, "adv/mean_abs_reasoning": 0.6641005277633667, "adv/mean_abs_step_conf": 0.807522177696228, "adv/ratio_final_to_reasoning": 1.1427853665462846, "adv/ratio_step_to_reasoning": 1.2159637644256858, "adv/std_final_conf": 0.9353416562080383, "adv/std_reasoning": 0.8905836939811707, "adv/std_step_conf": 0.9368466138839722, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.540009250693802, "calib/avg_num_step_conf": 6.625, "calib/ece": 0.3523175965665237, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.8583690987124464, "calib/gap": 0.01629201356768406, "calib/mean_conf": 0.9574678111587984, "calib/mu_c": 0.9639007092198584, "calib/mu_w": 0.9476086956521743, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.3523175965665237, "calib/std_conf": 0.06447563363765581, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.8706469298245614, "calib/step_q_c_n": 912.0, "calib/step_q_gap": -0.004671947726459069, "calib/step_q_w": 0.8753188775510204, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 740.88671875, "completions/mean_terminated_length": 755.6454467773438, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.0544, "grad_norm": 0.023038936778903008, "kl": 0.184295654296875, "learning_rate": 4.138888888888889e-06, "loss": -0.116, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.018784116953611374, "mask/share_reasoning": 0.8652922511100769, "mask/share_step_conf": 0.09639239311218262, "num_tokens": 15797984.0, "reward": 0.49212968349456787, "reward_std": 0.4767027497291565, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5462964773178101, "rewards/format_reward_step": 0.859375, "rewards/step_correlation_reward": 0.15515035390853882, "step": 51 }, { "adv/mean_abs_final_conf": 0.7391918897628784, "adv/mean_abs_reasoning": 0.6569032669067383, "adv/mean_abs_step_conf": 0.7805554866790771, "adv/ratio_final_to_reasoning": 1.1252674891443686, "adv/ratio_step_to_reasoning": 1.1882350507322017, "adv/std_final_conf": 0.9189611673355103, "adv/std_reasoning": 0.8904977440834045, "adv/std_step_conf": 0.9366629123687744, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.5567062818336164, "calib/avg_num_step_conf": 6.96484375, "calib/ece": 0.30367965367965377, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.8744588744588745, "calib/gap": -0.001613752122241463, "calib/mean_conf": 0.9556277056277057, "calib/mu_c": 0.9550967741935481, "calib/mu_w": 0.9567105263157896, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2941558441558442, "calib/std_conf": 0.09776471407915169, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.8801235839340886, "calib/step_q_c_n": 971.0, "calib/step_q_gap": 0.06337481546118218, "calib/step_q_w": 0.8167487684729065, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 760.16796875, "completions/mean_terminated_length": 775.310791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.055466666666666664, "grad_norm": 0.021371837705373764, "kl": 0.197784423828125, "learning_rate": 4.111111111111111e-06, "loss": -0.1177, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.01816929131746292, "mask/share_reasoning": 0.8703060150146484, "mask/share_step_conf": 0.09199343621730804, "num_tokens": 16100539.0, "reward": 0.6138618588447571, "reward_std": 0.44294503331184387, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.5888835787773132, "rewards/format_reward_step": 0.859375, "rewards/step_correlation_reward": 0.3450901210308075, "step": 52 }, { "adv/mean_abs_final_conf": 0.7655702829360962, "adv/mean_abs_reasoning": 0.6462835073471069, "adv/mean_abs_step_conf": 0.7980622053146362, "adv/ratio_final_to_reasoning": 1.1845734483905412, "adv/ratio_step_to_reasoning": 1.2348484778616078, "adv/std_final_conf": 0.9017614126205444, "adv/std_reasoning": 0.843338668346405, "adv/std_step_conf": 0.9365853071212769, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5494510282328336, "calib/avg_num_step_conf": 6.3671875, "calib/ece": 0.3045814977973569, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.87109375, "calib/frac_conf_gt_0.9": 0.9074889867841409, "calib/gap": 0.012929592192401529, "calib/mean_conf": 0.9609691629955949, "calib/mu_c": 0.9652980132450331, "calib/mu_w": 0.9523684210526315, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.30017621145374457, "calib/std_conf": 0.0841224926089066, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.8756695464362853, "calib/step_q_c_n": 926.0, "calib/step_q_gap": -0.000807726290987576, "calib/step_q_w": 0.8764772727272728, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 775.47265625, "completions/mean_terminated_length": 790.9203491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.05653333333333333, "grad_norm": 0.018166445195674896, "kl": 0.19354248046875, "learning_rate": 4.083333333333334e-06, "loss": -0.0789, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.017578979954123497, "mask/share_reasoning": 0.8730362057685852, "mask/share_step_conf": 0.08985357731580734, "num_tokens": 16404884.0, "reward": 0.5861221551895142, "reward_std": 0.4507690966129303, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.5985612869262695, "rewards/format_reward_step": 0.87109375, "rewards/step_correlation_reward": 0.2807142436504364, "step": 53 }, { "adv/mean_abs_final_conf": 0.7066130638122559, "adv/mean_abs_reasoning": 0.6384649872779846, "adv/mean_abs_step_conf": 0.7776536345481873, "adv/ratio_final_to_reasoning": 1.1067373746285007, "adv/ratio_step_to_reasoning": 1.218005137389939, "adv/std_final_conf": 0.9105854630470276, "adv/std_reasoning": 0.8750973343849182, "adv/std_step_conf": 0.9368137121200562, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5397378694924707, "calib/avg_num_step_conf": 7.51953125, "calib/ece": 0.24951965065502194, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.8671875, "calib/frac_conf_gt_0.9": 0.8908296943231441, "calib/gap": 0.04114612381483562, "calib/mean_conf": 0.9551965065502184, "calib/mu_c": 0.9670552147239265, "calib/mu_w": 0.9259090909090909, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.24646288209606995, "calib/std_conf": 0.10126958610938055, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.8770415224913494, "calib/step_q_c_n": 1156.0, "calib/step_q_gap": -0.005130129004099371, "calib/step_q_w": 0.8821716514954487, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 686.09765625, "completions/mean_terminated_length": 708.2297973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.0576, "grad_norm": 0.0214396882802248, "kl": 0.212432861328125, "learning_rate": 4.055555555555556e-06, "loss": -0.0885, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.019736099988222122, "mask/share_reasoning": 0.8435460925102234, "mask/share_step_conf": 0.1054677963256836, "num_tokens": 16686757.0, "reward": 0.6288886070251465, "reward_std": 0.430873841047287, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.649349570274353, "rewards/format_reward_step": 0.8671875, "rewards/step_correlation_reward": 0.3076462745666504, "step": 54 }, { "adv/mean_abs_final_conf": 0.7312518954277039, "adv/mean_abs_reasoning": 0.5973133444786072, "adv/mean_abs_step_conf": 0.7853642702102661, "adv/ratio_final_to_reasoning": 1.2242349885318755, "adv/ratio_step_to_reasoning": 1.314827933227924, "adv/std_final_conf": 0.8862102627754211, "adv/std_reasoning": 0.8268293142318726, "adv/std_step_conf": 0.9367680549621582, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6168327796234773, "calib/avg_num_step_conf": 6.91796875, "calib/ece": 0.412863247863248, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.8418803418803419, "calib/gap": 0.02432115171650051, "calib/mean_conf": 0.9555982905982908, "calib/mu_c": 0.9665116279069768, "calib/mu_w": 0.9421904761904762, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4085897435897437, "calib/std_conf": 0.08376723548421357, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8804960541149945, "calib/step_q_c_n": 887.0, "calib/step_q_gap": -0.0022188780117021745, "calib/step_q_w": 0.8827149321266967, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 737.4765625, "completions/mean_terminated_length": 755.176025390625, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.058666666666666666, "grad_norm": 0.024389022961258888, "kl": 0.2047882080078125, "learning_rate": 4.027777777777779e-06, "loss": -0.0954, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.01948772370815277, "mask/share_reasoning": 0.854324221611023, "mask/share_step_conf": 0.10275053232908249, "num_tokens": 16983375.0, "reward": 0.5285339951515198, "reward_std": 0.42518290877342224, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5391316413879395, "rewards/format_reward_step": 0.90625, "rewards/step_correlation_reward": 0.2359049916267395, "step": 55 }, { "adv/mean_abs_final_conf": 0.7237775325775146, "adv/mean_abs_reasoning": 0.5657122135162354, "adv/mean_abs_step_conf": 0.7897391319274902, "adv/ratio_final_to_reasoning": 1.2794094157501215, "adv/ratio_step_to_reasoning": 1.3960086295800391, "adv/std_final_conf": 0.8957918286323547, "adv/std_reasoning": 0.7931345105171204, "adv/std_step_conf": 0.9366181492805481, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5529282687651331, "calib/avg_num_step_conf": 6.6015625, "calib/ece": 0.44352173913043497, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.8782608695652174, "calib/gap": 0.02880750605326865, "calib/mean_conf": 0.9565652173913044, "calib/mu_c": 0.9705932203389831, "calib/mu_w": 0.9417857142857144, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.44352173913043497, "calib/std_conf": 0.08865631054169681, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8833725029377205, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.008593003533667987, "calib/step_q_w": 0.8747794994040525, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 801.71484375, "completions/mean_terminated_length": 808.0275268554688, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.05973333333333333, "grad_norm": 0.028454406186938286, "kl": 0.1917572021484375, "learning_rate": 4.000000000000001e-06, "loss": -0.0014, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.018351132050156593, "mask/share_reasoning": 0.8761504292488098, "mask/share_step_conf": 0.09768596291542053, "num_tokens": 17295454.0, "reward": 0.49251890182495117, "reward_std": 0.3811336159706116, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.4930843710899353, "rewards/format_reward_step": 0.87890625, "rewards/step_correlation_reward": 0.22242222726345062, "step": 56 }, { "adv/mean_abs_final_conf": 0.761801540851593, "adv/mean_abs_reasoning": 0.6592216491699219, "adv/mean_abs_step_conf": 0.7712447643280029, "adv/ratio_final_to_reasoning": 1.155607589360631, "adv/ratio_step_to_reasoning": 1.169932397243231, "adv/std_final_conf": 0.9170172810554504, "adv/std_reasoning": 0.8751125335693359, "adv/std_step_conf": 0.9366140365600586, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.6095045045045046, "calib/avg_num_step_conf": 7.23828125, "calib/ece": 0.2956053811659194, "calib/final_conf_rate": 0.87109375, "calib/format_rate": 0.85546875, "calib/frac_conf_gt_0.9": 0.8385650224215246, "calib/gap": 0.0452162162162163, "calib/mean_conf": 0.9460089686098656, "calib/mu_c": 0.9612162162162163, "calib/mu_w": 0.916, "calib/nonempty_final_conf_rate": 0.87109375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.28896860986547096, "calib/std_conf": 0.11924177462795925, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.8902629107981221, "calib/step_q_c_n": 1065.0, "calib/step_q_gap": 0.017839052930101795, "calib/step_q_w": 0.8724238578680203, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 734.48046875, "completions/mean_terminated_length": 758.17333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.0608, "grad_norm": 0.019423753023147583, "kl": 0.1970977783203125, "learning_rate": 3.972222222222223e-06, "loss": -0.098, "mask/has_final_conf_rate": 0.87109375, "mask/share_final_conf": 0.01847108080983162, "mask/share_reasoning": 0.8484556674957275, "mask/share_step_conf": 0.10182324051856995, "num_tokens": 17590273.0, "reward": 0.5865829586982727, "reward_std": 0.43412959575653076, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6005644798278809, "rewards/format_reward_step": 0.85546875, "rewards/step_correlation_reward": 0.28275763988494873, "step": 57 }, { "adv/mean_abs_final_conf": 0.7713214159011841, "adv/mean_abs_reasoning": 0.703680157661438, "adv/mean_abs_step_conf": 0.8116668462753296, "adv/ratio_final_to_reasoning": 1.096125004383441, "adv/ratio_step_to_reasoning": 1.1534599028239862, "adv/std_final_conf": 0.9167519807815552, "adv/std_reasoning": 0.8905022740364075, "adv/std_step_conf": 0.9369077086448669, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.5551075268817205, "calib/avg_num_step_conf": 7.24609375, "calib/ece": 0.39802765647743815, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 0.759825327510917, "calib/gap": 0.030838069636456744, "calib/mean_conf": 0.9281586608442505, "calib/mu_c": 0.9422983870967742, "calib/mu_w": 0.9114603174603174, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.39235080058224164, "calib/std_conf": 0.1266060092216613, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8813956310679611, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.002439924634078161, "calib/step_q_w": 0.878955706433883, "calib/step_q_w_n": 1031.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 830.83203125, "completions/mean_terminated_length": 857.633056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 386.0, "epoch": 0.06186666666666667, "grad_norm": 0.02321190945804119, "kl": 0.1953125, "learning_rate": 3.944444444444445e-06, "loss": -0.0503, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.017203208059072495, "mask/share_reasoning": 0.8562976121902466, "mask/share_step_conf": 0.09524921327829361, "num_tokens": 17909286.0, "reward": 0.4562646150588989, "reward_std": 0.4806906580924988, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5300267934799194, "rewards/format_reward_step": 0.875, "rewards/step_correlation_reward": 0.10906495898962021, "step": 58 }, { "adv/mean_abs_final_conf": 0.7684636116027832, "adv/mean_abs_reasoning": 0.6147791743278503, "adv/mean_abs_step_conf": 0.7819816470146179, "adv/ratio_final_to_reasoning": 1.2499831544277, "adv/ratio_step_to_reasoning": 1.2719715951171788, "adv/std_final_conf": 0.9189398288726807, "adv/std_reasoning": 0.8433147072792053, "adv/std_step_conf": 0.9365225434303284, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.5273282442748091, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.37584415584415587, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.7662337662337663, "calib/gap": 0.020382442748091534, "calib/mean_conf": 0.9144588744588745, "calib/mu_c": 0.9232824427480916, "calib/mu_w": 0.9029, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.36160173160173165, "calib/std_conf": 0.1602447716364824, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8843528561099061, "calib/step_q_c_n": 922.0, "calib/step_q_gap": 0.00017689576294210863, "calib/step_q_w": 0.884175960346964, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 772.96484375, "completions/mean_terminated_length": 788.362548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.06293333333333333, "grad_norm": 0.021945487707853317, "kl": 0.2040863037109375, "learning_rate": 3.916666666666667e-06, "loss": -0.105, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.01884847693145275, "mask/share_reasoning": 0.8630433082580566, "mask/share_step_conf": 0.09857693314552307, "num_tokens": 18213413.0, "reward": 0.5048579573631287, "reward_std": 0.44036370515823364, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5499148368835449, "rewards/format_reward_step": 0.890625, "rewards/step_correlation_reward": 0.17698858678340912, "step": 59 }, { "adv/mean_abs_final_conf": 0.7791723608970642, "adv/mean_abs_reasoning": 0.6329578161239624, "adv/mean_abs_step_conf": 0.780436098575592, "adv/ratio_final_to_reasoning": 1.2310020368631742, "adv/ratio_step_to_reasoning": 1.2329985959486858, "adv/std_final_conf": 0.9260181784629822, "adv/std_reasoning": 0.8593480587005615, "adv/std_step_conf": 0.9366816878318787, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5887887413029728, "calib/avg_num_step_conf": 7.30078125, "calib/ece": 0.373849557522124, "calib/final_conf_rate": 0.8828125, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.6858407079646017, "calib/gap": 0.0424019607843138, "calib/mean_conf": 0.9083628318584072, "calib/mu_c": 0.9275000000000001, "calib/mu_w": 0.8850980392156863, "calib/nonempty_final_conf_rate": 0.8828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3667699115044249, "calib/std_conf": 0.15408729636707594, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8931798245614035, "calib/step_q_c_n": 912.0, "calib/step_q_gap": 0.002093095198812067, "calib/step_q_w": 0.8910867293625915, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 726.265625, "completions/mean_terminated_length": 765.1193237304688, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.064, "grad_norm": 0.019068323075771332, "kl": 0.208465576171875, "learning_rate": 3.88888888888889e-06, "loss": -0.1865, "mask/has_final_conf_rate": 0.8828125, "mask/share_final_conf": 0.018185026943683624, "mask/share_reasoning": 0.8298521041870117, "mask/share_step_conf": 0.10118165612220764, "num_tokens": 18508193.0, "reward": 0.5068765878677368, "reward_std": 0.44211721420288086, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5381327867507935, "rewards/format_reward_step": 0.859375, "rewards/step_correlation_reward": 0.20530793070793152, "step": 60 }, { "adv/mean_abs_final_conf": 0.7496904730796814, "adv/mean_abs_reasoning": 0.501232922077179, "adv/mean_abs_step_conf": 0.7808792591094971, "adv/ratio_final_to_reasoning": 1.4956928008097707, "adv/ratio_step_to_reasoning": 1.5579169378448343, "adv/std_final_conf": 0.919006884098053, "adv/std_reasoning": 0.757882297039032, "adv/std_step_conf": 0.9365785121917725, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5127613661730783, "calib/avg_num_step_conf": 8.12109375, "calib/ece": 0.2959583333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.6625, "calib/gap": 0.003881985266760757, "calib/mean_conf": 0.9063750000000002, "calib/mu_c": 0.9078145695364237, "calib/mu_w": 0.9039325842696629, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.28658333333333336, "calib/std_conf": 0.1391513901295995, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8957231588287489, "calib/step_q_c_n": 1127.0, "calib/step_q_gap": -0.006850370583015852, "calib/step_q_w": 0.9025735294117647, "calib/step_q_w_n": 952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 700.57421875, "completions/mean_terminated_length": 711.6944580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.06506666666666666, "grad_norm": 0.020417187362909317, "kl": 0.203521728515625, "learning_rate": 3.861111111111112e-06, "loss": -0.0435, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.020129330456256866, "mask/share_reasoning": 0.8452743887901306, "mask/share_step_conf": 0.11897126585245132, "num_tokens": 18791604.0, "reward": 0.581881582736969, "reward_std": 0.37869957089424133, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6180222630500793, "rewards/format_reward_step": 0.91796875, "rewards/step_correlation_reward": 0.24261584877967834, "step": 61 }, { "adv/mean_abs_final_conf": 0.8006377816200256, "adv/mean_abs_reasoning": 0.7234856486320496, "adv/mean_abs_step_conf": 0.8082146644592285, "adv/ratio_final_to_reasoning": 1.106639479489128, "adv/ratio_step_to_reasoning": 1.117112227433098, "adv/std_final_conf": 0.9264470338821411, "adv/std_reasoning": 0.8750442862510681, "adv/std_step_conf": 0.9368706345558167, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5740320473144901, "calib/avg_num_step_conf": 8.84375, "calib/ece": 0.3516322314049586, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5537190082644629, "calib/gap": 0.03770751667698258, "calib/mean_conf": 0.8761776859504133, "calib/mu_c": 0.8934732824427483, "calib/mu_w": 0.8557657657657657, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3432438016528925, "calib/std_conf": 0.16861234035581443, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.893684749232344, "calib/step_q_c_n": 977.0, "calib/step_q_gap": 0.00502119056878525, "calib/step_q_w": 0.8886635586635587, "calib/step_q_w_n": 1287.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 789.6796875, "completions/mean_terminated_length": 799.0435180664062, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.06613333333333334, "grad_norm": 0.02827797457575798, "kl": 0.198089599609375, "learning_rate": 3.833333333333334e-06, "loss": -0.0186, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018833544105291367, "mask/share_reasoning": 0.8600931763648987, "mask/share_step_conf": 0.10935452580451965, "num_tokens": 19100842.0, "reward": 0.50331711769104, "reward_std": 0.4939599633216858, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5874221324920654, "rewards/format_reward_step": 0.9296875, "rewards/step_correlation_reward": 0.13093087077140808, "step": 62 }, { "adv/mean_abs_final_conf": 0.7669820785522461, "adv/mean_abs_reasoning": 0.5731774568557739, "adv/mean_abs_step_conf": 0.7635728120803833, "adv/ratio_final_to_reasoning": 1.3381232450410874, "adv/ratio_step_to_reasoning": 1.332175232900895, "adv/std_final_conf": 0.9358986616134644, "adv/std_reasoning": 0.8267452120780945, "adv/std_step_conf": 0.9365659356117249, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6514564450287138, "calib/avg_num_step_conf": 6.92578125, "calib/ece": 0.28414634146341466, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.45528455284552843, "calib/gap": 0.07411056528056459, "calib/mean_conf": 0.8356097560975609, "calib/mu_c": 0.8648322147651007, "calib/mu_w": 0.7907216494845362, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2570325203252033, "calib/std_conf": 0.20707661808730868, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8932848837209304, "calib/step_q_c_n": 1032.0, "calib/step_q_gap": 0.0033388648275429755, "calib/step_q_w": 0.8899460188933874, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 775.03125, "completions/mean_terminated_length": 790.4701538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.0672, "grad_norm": 0.022803494706749916, "kl": 0.1893310546875, "learning_rate": 3.8055555555555556e-06, "loss": -0.0539, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019247833639383316, "mask/share_reasoning": 0.8624863028526306, "mask/share_step_conf": 0.09873461723327637, "num_tokens": 19407890.0, "reward": 0.6503553986549377, "reward_std": 0.4144784212112427, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6726309061050415, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.3210486173629761, "step": 63 }, { "adv/mean_abs_final_conf": 0.7718286514282227, "adv/mean_abs_reasoning": 0.6045994162559509, "adv/mean_abs_step_conf": 0.7827744483947754, "adv/ratio_final_to_reasoning": 1.2765950986321775, "adv/ratio_step_to_reasoning": 1.2946993122193091, "adv/std_final_conf": 0.9342676997184753, "adv/std_reasoning": 0.8102007508277893, "adv/std_step_conf": 0.9365555644035339, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5747526283240568, "calib/avg_num_step_conf": 7.97265625, "calib/ece": 0.2109795918367347, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5061224489795918, "calib/gap": 0.055535714285714355, "calib/mean_conf": 0.8552244897959184, "calib/mu_c": 0.8726785714285715, "calib/mu_w": 0.8171428571428572, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1902448979591837, "calib/std_conf": 0.18717788824866435, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8956170328180094, "calib/step_q_c_n": 1229.0, "calib/step_q_gap": -0.016465479497261515, "calib/step_q_w": 0.9120825123152709, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 722.1171875, "completions/mean_terminated_length": 742.4176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.06826666666666667, "grad_norm": 0.01971907913684845, "kl": 0.207366943359375, "learning_rate": 3.777777777777778e-06, "loss": -0.0808, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02002081833779812, "mask/share_reasoning": 0.8429508209228516, "mask/share_step_conf": 0.10968463122844696, "num_tokens": 19696528.0, "reward": 0.7009572982788086, "reward_std": 0.45025748014450073, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.708678126335144, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.3721427321434021, "step": 64 }, { "adv/mean_abs_final_conf": 0.705558180809021, "adv/mean_abs_reasoning": 0.4466238021850586, "adv/mean_abs_step_conf": 0.7446850538253784, "adv/ratio_final_to_reasoning": 1.579759469506896, "adv/ratio_step_to_reasoning": 1.6673653535303927, "adv/std_final_conf": 0.9230350255966187, "adv/std_reasoning": 0.7576389312744141, "adv/std_step_conf": 0.9363834857940674, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5003857212988287, "calib/avg_num_step_conf": 8.44921875, "calib/ece": 0.31627049180327876, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5614754098360656, "calib/gap": -0.0045255628024407635, "calib/mean_conf": 0.8707786885245902, "calib/mu_c": 0.8689795918367346, "calib/mu_w": 0.8735051546391753, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2922950819672132, "calib/std_conf": 0.176158368218379, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8958239700374532, "calib/step_q_c_n": 1068.0, "calib/step_q_gap": -0.015418039094966973, "calib/step_q_w": 0.9112420091324201, "calib/step_q_w_n": 1095.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 678.75, "completions/mean_terminated_length": 697.831298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.06933333333333333, "grad_norm": 0.02425149828195572, "kl": 0.190521240234375, "learning_rate": 3.7500000000000005e-06, "loss": -0.0316, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.021705515682697296, "mask/share_reasoning": 0.8342607617378235, "mask/share_step_conf": 0.11668997257947922, "num_tokens": 19975312.0, "reward": 0.6031345725059509, "reward_std": 0.3473690152168274, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6245855093002319, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2762148082256317, "step": 65 }, { "adv/mean_abs_final_conf": 0.7633180618286133, "adv/mean_abs_reasoning": 0.5400905609130859, "adv/mean_abs_step_conf": 0.7656220197677612, "adv/ratio_final_to_reasoning": 1.4133149458085978, "adv/ratio_step_to_reasoning": 1.4175808191748216, "adv/std_final_conf": 0.9359403848648071, "adv/std_reasoning": 0.7928449511528015, "adv/std_step_conf": 0.9361146688461304, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6635693006100156, "calib/avg_num_step_conf": 7.98046875, "calib/ece": 0.3714086471408645, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.37656903765690375, "calib/gap": 0.08457393483709286, "calib/mean_conf": 0.8112691771269176, "calib/mu_c": 0.8583333333333334, "calib/mu_w": 0.7737593984962405, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3695815899581588, "calib/std_conf": 0.19077935433179818, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8942408376963351, "calib/step_q_c_n": 764.0, "calib/step_q_gap": -0.010505057534313855, "calib/step_q_w": 0.904745895230649, "calib/step_q_w_n": 1279.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 803.25, "completions/mean_terminated_length": 832.5182495117188, "completions/min_length": 0.0, "completions/min_terminated_length": 287.0, "epoch": 0.0704, "grad_norm": 0.028425876051187515, "kl": 0.188751220703125, "learning_rate": 3.7222222222222225e-06, "loss": -0.0473, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018159134313464165, "mask/share_reasoning": 0.8508436679840088, "mask/share_step_conf": 0.0958409234881401, "num_tokens": 20287296.0, "reward": 0.48330575227737427, "reward_std": 0.3188536763191223, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.5817521810531616, "rewards/format_reward_step": 0.9296875, "rewards/step_correlation_reward": 0.11610931158065796, "step": 66 }, { "adv/mean_abs_final_conf": 0.7104264497756958, "adv/mean_abs_reasoning": 0.35727912187576294, "adv/mean_abs_step_conf": 0.748501181602478, "adv/ratio_final_to_reasoning": 1.9884353892437443, "adv/ratio_step_to_reasoning": 2.0950039780459244, "adv/std_final_conf": 0.9293177127838135, "adv/std_reasoning": 0.6814470887184143, "adv/std_step_conf": 0.9362070560455322, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.693968949044586, "calib/avg_num_step_conf": 7.41015625, "calib/ece": 0.1912648221343874, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3241106719367589, "calib/gap": 0.1331303078556264, "calib/mean_conf": 0.8086561264822133, "calib/mu_c": 0.8591719745222931, "calib/mu_w": 0.7260416666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1896837944664032, "calib/std_conf": 0.1835394550318141, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8963327526132404, "calib/step_q_c_n": 1148.0, "calib/step_q_gap": -0.004174590510925169, "calib/step_q_w": 0.9005073431241656, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 772.74609375, "completions/mean_terminated_length": 778.8306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.07146666666666666, "grad_norm": 0.039756689220666885, "kl": 0.189849853515625, "learning_rate": 3.694444444444445e-06, "loss": 0.0167, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01978326216340065, "mask/share_reasoning": 0.8672667145729065, "mask/share_step_conf": 0.10513752698898315, "num_tokens": 20590127.0, "reward": 0.6821687817573547, "reward_std": 0.2857590317726135, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7492746114730835, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2947504222393036, "step": 67 }, { "adv/mean_abs_final_conf": 0.7532162666320801, "adv/mean_abs_reasoning": 0.5838368535041809, "adv/mean_abs_step_conf": 0.7709550857543945, "adv/ratio_final_to_reasoning": 1.290114288111972, "adv/ratio_step_to_reasoning": 1.320497466247861, "adv/std_final_conf": 0.9340994954109192, "adv/std_reasoning": 0.826541006565094, "adv/std_step_conf": 0.9367088079452515, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6727183833116037, "calib/avg_num_step_conf": 7.7109375, "calib/ece": 0.31544354838709676, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3870967741935484, "calib/gap": 0.09035071707953068, "calib/mean_conf": 0.8143951612903227, "calib/mu_c": 0.8573846153846152, "calib/mu_w": 0.7670338983050845, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30282258064516127, "calib/std_conf": 0.20217077253379004, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8955928853754942, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.0017571265397352542, "calib/step_q_w": 0.8938357588357589, "calib/step_q_w_n": 962.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 742.51953125, "completions/mean_terminated_length": 751.3241577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.07253333333333334, "grad_norm": 0.9252099990844727, "kl": 0.35418701171875, "learning_rate": 3.6666666666666666e-06, "loss": -0.0128, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02056063711643219, "mask/share_reasoning": 0.853926956653595, "mask/share_step_conf": 0.11379365622997284, "num_tokens": 20884300.0, "reward": 0.5387983322143555, "reward_std": 0.3924047648906708, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6455503702163696, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.13829626142978668, "step": 68 }, { "adv/mean_abs_final_conf": 0.7773676514625549, "adv/mean_abs_reasoning": 0.5483871102333069, "adv/mean_abs_step_conf": 0.7789148092269897, "adv/ratio_final_to_reasoning": 1.4175527414053006, "adv/ratio_step_to_reasoning": 1.4203740290241444, "adv/std_final_conf": 0.9361997246742249, "adv/std_reasoning": 0.7928118109703064, "adv/std_step_conf": 0.9365246295928955, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6481659304782024, "calib/avg_num_step_conf": 7.12890625, "calib/ece": 0.26452, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.22, "calib/gap": 0.10831380833493387, "calib/mean_conf": 0.7264400000000001, "calib/mu_c": 0.7814634146341464, "calib/mu_w": 0.6731496062992125, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24947999999999998, "calib/std_conf": 0.22698838384375533, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8946553672316385, "calib/step_q_c_n": 885.0, "calib/step_q_gap": 0.004868133189085189, "calib/step_q_w": 0.8897872340425533, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 792.01953125, "completions/mean_terminated_length": 801.4110717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 0.0736, "grad_norm": 0.03803575411438942, "kl": 0.18316650390625, "learning_rate": 3.638888888888889e-06, "loss": -0.0205, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018899952992796898, "mask/share_reasoning": 0.8710591793060303, "mask/share_step_conf": 0.09832212328910828, "num_tokens": 21191553.0, "reward": 0.5194801688194275, "reward_std": 0.36671119928359985, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.681368350982666, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.06618569791316986, "step": 69 }, { "adv/mean_abs_final_conf": 0.7430238723754883, "adv/mean_abs_reasoning": 0.5215783715248108, "adv/mean_abs_step_conf": 0.7823535203933716, "adv/ratio_final_to_reasoning": 1.4245680283929174, "adv/ratio_step_to_reasoning": 1.4999730876612012, "adv/std_final_conf": 0.919750452041626, "adv/std_reasoning": 0.7754024267196655, "adv/std_step_conf": 0.9365573525428772, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7753010845698316, "calib/avg_num_step_conf": 7.51953125, "calib/ece": 0.21260162601626023, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3170731707317073, "calib/gap": 0.20759198882161145, "calib/mean_conf": 0.7467479674796748, "calib/mu_c": 0.8421052631578947, "calib/mu_w": 0.6345132743362832, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20934959349593502, "calib/std_conf": 0.23748682782457178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8947928994082841, "calib/step_q_c_n": 1014.0, "calib/step_q_gap": -0.009140507104705264, "calib/step_q_w": 0.9039334065129894, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 795.75390625, "completions/mean_terminated_length": 808.3849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.07466666666666667, "grad_norm": 0.042088743299245834, "kl": 0.1686248779296875, "learning_rate": 3.6111111111111115e-06, "loss": 0.0327, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01939772255718708, "mask/share_reasoning": 0.8597534894943237, "mask/share_step_conf": 0.10522376000881195, "num_tokens": 21502258.0, "reward": 0.6089703440666199, "reward_std": 0.3306017518043518, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7263593673706055, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.19548749923706055, "step": 70 }, { "adv/mean_abs_final_conf": 0.7810891270637512, "adv/mean_abs_reasoning": 0.5818857550621033, "adv/mean_abs_step_conf": 0.7901183366775513, "adv/ratio_final_to_reasoning": 1.3423410356219967, "adv/ratio_step_to_reasoning": 1.3578581874602238, "adv/std_final_conf": 0.9348444938659668, "adv/std_reasoning": 0.8429794311523438, "adv/std_step_conf": 0.9366829991340637, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6373737373737374, "calib/avg_num_step_conf": 7.73046875, "calib/ece": 0.23112925170068033, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3306122448979592, "calib/gap": 0.11414590347923692, "calib/mean_conf": 0.6934421768707483, "calib/mu_c": 0.7446913580246913, "calib/mu_w": 0.6305454545454544, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1867755102040817, "calib/std_conf": 0.28514449046638407, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8991023166023168, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.014489379168594496, "calib/step_q_w": 0.8846129374337223, "calib/step_q_w_n": 943.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 772.3671875, "completions/mean_terminated_length": 794.0802612304688, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.07573333333333333, "grad_norm": 0.028518889099359512, "kl": 0.1798095703125, "learning_rate": 3.5833333333333335e-06, "loss": -0.0375, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019431080669164658, "mask/share_reasoning": 0.8467603325843811, "mask/share_step_conf": 0.10646484792232513, "num_tokens": 21804392.0, "reward": 0.607774019241333, "reward_std": 0.3878346085548401, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6770904660224915, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.24158260226249695, "step": 71 }, { "adv/mean_abs_final_conf": 0.7352126836776733, "adv/mean_abs_reasoning": 0.49225544929504395, "adv/mean_abs_step_conf": 0.7592939138412476, "adv/ratio_final_to_reasoning": 1.4935592581668053, "adv/ratio_step_to_reasoning": 1.5424794482796031, "adv/std_final_conf": 0.9240669012069702, "adv/std_reasoning": 0.7753722667694092, "adv/std_step_conf": 0.9365233778953552, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7154524241638581, "calib/avg_num_step_conf": 7.4140625, "calib/ece": 0.26682730923694775, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3534136546184739, "calib/gap": 0.19477119522945296, "calib/mean_conf": 0.723293172690763, "calib/mu_c": 0.8273275862068965, "calib/mu_w": 0.6325563909774435, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26212851405622484, "calib/std_conf": 0.26641227523362154, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9019730941704036, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.004030748245950289, "calib/step_q_w": 0.8979423459244533, "calib/step_q_w_n": 1006.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 753.2734375, "completions/mean_terminated_length": 762.20556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.0768, "grad_norm": 0.03393369913101196, "kl": 0.193572998046875, "learning_rate": 3.555555555555556e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019757676869630814, "mask/share_reasoning": 0.8599628210067749, "mask/share_step_conf": 0.1085607260465622, "num_tokens": 22101638.0, "reward": 0.5681977868080139, "reward_std": 0.36324363946914673, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6912593841552734, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.16076117753982544, "step": 72 }, { "adv/mean_abs_final_conf": 0.7578251361846924, "adv/mean_abs_reasoning": 0.6473281383514404, "adv/mean_abs_step_conf": 0.7835944890975952, "adv/ratio_final_to_reasoning": 1.1706970410936504, "adv/ratio_step_to_reasoning": 1.2105058357160037, "adv/std_final_conf": 0.9243986010551453, "adv/std_reasoning": 0.8267653584480286, "adv/std_step_conf": 0.9366769194602966, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6147632129774986, "calib/avg_num_step_conf": 7.69921875, "calib/ece": 0.19759842519685045, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.38188976377952755, "calib/gap": 0.10122710622710618, "calib/mean_conf": 0.7414566929133858, "calib/mu_c": 0.7805128205128204, "calib/mu_w": 0.6792857142857143, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16244094488188981, "calib/std_conf": 0.2645569839010028, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8973987941429803, "calib/step_q_c_n": 1161.0, "calib/step_q_gap": -0.006317255239735919, "calib/step_q_w": 0.9037160493827162, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 726.265625, "completions/mean_terminated_length": 726.265625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.07786666666666667, "grad_norm": 0.024225082248449326, "kl": 0.1775665283203125, "learning_rate": 3.5277777777777784e-06, "loss": 0.1016, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020709872245788574, "mask/share_reasoning": 0.8673610091209412, "mask/share_step_conf": 0.11192911863327026, "num_tokens": 22394594.0, "reward": 0.6562621593475342, "reward_std": 0.42811983823776245, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7120077610015869, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2825477719306946, "step": 73 }, { "adv/mean_abs_final_conf": 0.7497447729110718, "adv/mean_abs_reasoning": 0.6239073872566223, "adv/mean_abs_step_conf": 0.7768936157226562, "adv/ratio_final_to_reasoning": 1.2016924117660601, "adv/ratio_step_to_reasoning": 1.2452066309692666, "adv/std_final_conf": 0.917289137840271, "adv/std_reasoning": 0.8591247797012329, "adv/std_step_conf": 0.9366591572761536, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6838306395911037, "calib/avg_num_step_conf": 8.015625, "calib/ece": 0.23120331950207476, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.3900414937759336, "calib/gap": 0.1913862411935351, "calib/mean_conf": 0.6926970954356846, "calib/mu_c": 0.7832283464566929, "calib/mu_w": 0.5918421052631578, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1984647302904565, "calib/std_conf": 0.3053855931948699, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9011902339776197, "calib/step_q_c_n": 983.0, "calib/step_q_gap": -0.0033934891280865687, "calib/step_q_w": 0.9045837231057062, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 732.9140625, "completions/mean_terminated_length": 753.51806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.07893333333333333, "grad_norm": 0.040199581533670425, "kl": 0.211181640625, "learning_rate": 3.5e-06, "loss": -0.0316, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02010929211974144, "mask/share_reasoning": 0.8409967422485352, "mask/share_step_conf": 0.11155019700527191, "num_tokens": 22686148.0, "reward": 0.5797815322875977, "reward_std": 0.39971089363098145, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.68291175365448, "rewards/format_reward_step": 0.94140625, "rewards/step_correlation_reward": 0.18915137648582458, "step": 74 }, { "adv/mean_abs_final_conf": 0.7649242877960205, "adv/mean_abs_reasoning": 0.5034365653991699, "adv/mean_abs_step_conf": 0.7570182681083679, "adv/ratio_final_to_reasoning": 1.5194055028353364, "adv/ratio_step_to_reasoning": 1.5037013998141664, "adv/std_final_conf": 0.9224418997764587, "adv/std_reasoning": 0.7577688694000244, "adv/std_step_conf": 0.9364383220672607, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.741899908731366, "calib/avg_num_step_conf": 7.64453125, "calib/ece": 0.14323962516733602, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.42971887550200805, "calib/gap": 0.28359851941993697, "calib/mean_conf": 0.6899330655957163, "calib/mu_c": 0.7764932562620422, "calib/mu_w": 0.4928947368421053, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06919678714859441, "calib/std_conf": 0.32527041573984733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.89967542503864, "calib/step_q_c_n": 1294.0, "calib/step_q_gap": 0.0060354049280315625, "calib/step_q_w": 0.8936400201106084, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2011.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 680.83203125, "completions/mean_terminated_length": 697.1720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.08, "grad_norm": 0.06415676325559616, "kl": 0.205810546875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0964, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021222256124019623, "mask/share_reasoning": 0.8401088714599609, "mask/share_step_conf": 0.11523135751485825, "num_tokens": 22965193.0, "reward": 0.7654494047164917, "reward_std": 0.35930517315864563, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7767044901847839, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.4252881109714508, "step": 75 }, { "adv/mean_abs_final_conf": 0.7344155311584473, "adv/mean_abs_reasoning": 0.45388472080230713, "adv/mean_abs_step_conf": 0.7792856097221375, "adv/ratio_final_to_reasoning": 1.6180662126283105, "adv/ratio_step_to_reasoning": 1.7169240866813869, "adv/std_final_conf": 0.9143033027648926, "adv/std_reasoning": 0.7392937541007996, "adv/std_step_conf": 0.9359453916549683, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7301700680272109, "calib/avg_num_step_conf": 7.4296875, "calib/ece": 0.14139784946236564, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3870967741935484, "calib/gap": 0.2686938775510206, "calib/mean_conf": 0.639489247311828, "calib/mu_c": 0.7456666666666668, "calib/mu_w": 0.4769727891156462, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08802419354838713, "calib/std_conf": 0.332289102101497, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9007261592300965, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.0097907178598724, "calib/step_q_w": 0.8909354413702241, "calib/step_q_w_n": 759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 757.26953125, "completions/mean_terminated_length": 769.2897338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.08106666666666666, "grad_norm": 0.09344251453876495, "kl": 0.189422607421875, "learning_rate": 3.444444444444445e-06, "loss": 0.0008, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020103726536035538, "mask/share_reasoning": 0.8557681441307068, "mask/share_step_conf": 0.10850309580564499, "num_tokens": 23262110.0, "reward": 0.6588497757911682, "reward_std": 0.3028367757797241, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7497581839561462, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.25700393319129944, "step": 76 }, { "adv/mean_abs_final_conf": 0.7352517247200012, "adv/mean_abs_reasoning": 0.5148537755012512, "adv/mean_abs_step_conf": 0.7778766751289368, "adv/ratio_final_to_reasoning": 1.4280787277983442, "adv/ratio_step_to_reasoning": 1.5108691285629823, "adv/std_final_conf": 0.8905107975006104, "adv/std_reasoning": 0.7576778531074524, "adv/std_step_conf": 0.9358101487159729, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6792625045637094, "calib/avg_num_step_conf": 8.6640625, "calib/ece": 0.22512768817204304, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5201612903225806, "calib/gap": 0.1941892418157477, "calib/mean_conf": 0.7020900537634409, "calib/mu_c": 0.767080808080808, "calib/mu_w": 0.5728915662650603, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13094758064516132, "calib/std_conf": 0.34288262180861306, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.902955700798838, "calib/step_q_c_n": 1377.0, "calib/step_q_gap": 0.0015288280283265454, "calib/step_q_w": 0.9014268727705115, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 763.625, "completions/mean_terminated_length": 778.836669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.08213333333333334, "grad_norm": 0.0820995420217514, "kl": 0.18267822265625, "learning_rate": 3.416666666666667e-06, "loss": -0.0688, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020010104402899742, "mask/share_reasoning": 0.841918408870697, "mask/share_step_conf": 0.11854026466608047, "num_tokens": 23562262.0, "reward": 0.6911901831626892, "reward_std": 0.33868926763534546, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7139571905136108, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.34732943773269653, "step": 77 }, { "adv/mean_abs_final_conf": 0.7347139716148376, "adv/mean_abs_reasoning": 0.5040462017059326, "adv/mean_abs_step_conf": 0.7889659404754639, "adv/ratio_final_to_reasoning": 1.4576321954777465, "adv/ratio_step_to_reasoning": 1.5652651241200253, "adv/std_final_conf": 0.9141965508460999, "adv/std_reasoning": 0.739403486251831, "adv/std_step_conf": 0.9360442757606506, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6120240752011903, "calib/avg_num_step_conf": 8.15625, "calib/ece": 0.2332142857142857, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.17125786163522017, "calib/mean_conf": 0.7713888888888889, "calib/mu_c": 0.8345911949685534, "calib/mu_w": 0.6633333333333332, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1868253968253968, "calib/std_conf": 0.3135410525315225, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9005529225908372, "calib/step_q_c_n": 1266.0, "calib/step_q_gap": -0.005360702713299137, "calib/step_q_w": 0.9059136253041363, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 777.8984375, "completions/mean_terminated_length": 790.24609375, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.0832, "grad_norm": 0.07548096030950546, "kl": 0.18994140625, "learning_rate": 3.3888888888888893e-06, "loss": -0.0264, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01880320906639099, "mask/share_reasoning": 0.8600537776947021, "mask/share_step_conf": 0.10551798343658447, "num_tokens": 23869428.0, "reward": 0.6681511402130127, "reward_std": 0.33723169565200806, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7137347459793091, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.30225497484207153, "step": 78 }, { "adv/mean_abs_final_conf": 0.6985340118408203, "adv/mean_abs_reasoning": 0.5203701257705688, "adv/mean_abs_step_conf": 0.7808733582496643, "adv/ratio_final_to_reasoning": 1.3423791590772525, "adv/ratio_step_to_reasoning": 1.5006114294000639, "adv/std_final_conf": 0.8765206336975098, "adv/std_reasoning": 0.7576753497123718, "adv/std_step_conf": 0.9352340698242188, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6553852414541509, "calib/avg_num_step_conf": 7.7421875, "calib/ece": 0.3181392235609105, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8152610441767069, "calib/gap": 0.10126853861457752, "calib/mean_conf": 0.8972824631860777, "calib/mu_c": 0.9367324561403508, "calib/mu_w": 0.8354639175257733, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3024899598393576, "calib/std_conf": 0.22779343355685486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9020017482517484, "calib/step_q_c_n": 1144.0, "calib/step_q_gap": 0.002144946342440446, "calib/step_q_w": 0.899856801909308, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 762.0703125, "completions/mean_terminated_length": 777.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.08426666666666667, "grad_norm": 0.07747872173786163, "kl": 0.3189697265625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0463, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01893690787255764, "mask/share_reasoning": 0.8566397428512573, "mask/share_step_conf": 0.10489208251237869, "num_tokens": 24170894.0, "reward": 0.6061649322509766, "reward_std": 0.3817436099052429, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6577041149139404, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.24134458601474762, "step": 79 }, { "adv/mean_abs_final_conf": 0.6666533946990967, "adv/mean_abs_reasoning": 0.5015234351158142, "adv/mean_abs_step_conf": 0.7864340543746948, "adv/ratio_final_to_reasoning": 1.3292567166779552, "adv/ratio_step_to_reasoning": 1.5680903409689873, "adv/std_final_conf": 0.8309153318405151, "adv/std_reasoning": 0.7394514679908752, "adv/std_step_conf": 0.9357711672782898, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5628825197790714, "calib/avg_num_step_conf": 8.39453125, "calib/ece": 0.261593625498008, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8924302788844621, "calib/gap": 0.08233318405732204, "calib/mean_conf": 0.935776892430279, "calib/mu_c": 0.9610344827586207, "calib/mu_w": 0.8787012987012986, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25207171314741045, "calib/std_conf": 0.1765681796614607, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9052186379928316, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": -0.011293298346691016, "calib/step_q_w": 0.9165119363395227, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 728.54296875, "completions/mean_terminated_length": 731.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.08533333333333333, "grad_norm": 0.02758839726448059, "kl": 0.171112060546875, "learning_rate": 3.3333333333333333e-06, "loss": 0.0601, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020696930587291718, "mask/share_reasoning": 0.854810357093811, "mask/share_step_conf": 0.12058646976947784, "num_tokens": 24459561.0, "reward": 0.6907826662063599, "reward_std": 0.3672957420349121, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.714138925075531, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.336176335811615, "step": 80 }, { "adv/mean_abs_final_conf": 0.6626981496810913, "adv/mean_abs_reasoning": 0.512515664100647, "adv/mean_abs_step_conf": 0.7866781949996948, "adv/ratio_final_to_reasoning": 1.293030040055423, "adv/ratio_step_to_reasoning": 1.5349349299989556, "adv/std_final_conf": 0.8433207869529724, "adv/std_reasoning": 0.7756554484367371, "adv/std_step_conf": 0.9362679719924927, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5691447696361177, "calib/avg_num_step_conf": 7.8984375, "calib/ece": 0.36096994535519134, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8934426229508197, "calib/gap": 0.021494694865769493, "calib/mean_conf": 0.9522267759562841, "calib/mu_c": 0.960419426048565, "calib/mu_w": 0.9389247311827955, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3471721311475411, "calib/std_conf": 0.1373709214728154, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9022589285714286, "calib/step_q_c_n": 1120.0, "calib/step_q_gap": -0.0017322022489705269, "calib/step_q_w": 0.9039911308203992, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 751.546875, "completions/mean_terminated_length": 772.6746826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.0864, "grad_norm": 0.02237076126039028, "kl": 0.1680908203125, "learning_rate": 3.3055555555555558e-06, "loss": -0.0794, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019753076136112213, "mask/share_reasoning": 0.8459489941596985, "mask/share_step_conf": 0.1069541722536087, "num_tokens": 24758205.0, "reward": 0.5961042046546936, "reward_std": 0.4187864065170288, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6139017343521118, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.2712753117084503, "step": 81 }, { "adv/mean_abs_final_conf": 0.6603360176086426, "adv/mean_abs_reasoning": 0.5264520049095154, "adv/mean_abs_step_conf": 0.7618018984794617, "adv/ratio_final_to_reasoning": 1.2543138053432594, "adv/ratio_step_to_reasoning": 1.4470490973064056, "adv/std_final_conf": 0.8467787504196167, "adv/std_reasoning": 0.7754591107368469, "adv/std_step_conf": 0.9360379576683044, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.527682119205298, "calib/avg_num_step_conf": 8.07421875, "calib/ece": 0.3805843293492698, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9561752988047809, "calib/gap": 0.017545253863134658, "calib/mean_conf": 0.9545551128818063, "calib/mu_c": 0.9615452538631346, "calib/mu_w": 0.944, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36677290836653403, "calib/std_conf": 0.16185023037746563, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9057720891824939, "calib/step_q_c_n": 1211.0, "calib/step_q_gap": -0.002510621097880028, "calib/step_q_w": 0.9082827102803739, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 700.9140625, "completions/mean_terminated_length": 706.4330444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.08746666666666666, "grad_norm": 0.01994318887591362, "kl": 0.177215576171875, "learning_rate": 3.277777777777778e-06, "loss": 0.016, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.021094955503940582, "mask/share_reasoning": 0.8498541116714478, "mask/share_step_conf": 0.12123844772577286, "num_tokens": 25043191.0, "reward": 0.5797907710075378, "reward_std": 0.3987249433994293, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6056525707244873, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.241428941488266, "step": 82 }, { "adv/mean_abs_final_conf": 0.5970791578292847, "adv/mean_abs_reasoning": 0.42100244760513306, "adv/mean_abs_step_conf": 0.7423392534255981, "adv/ratio_final_to_reasoning": 1.418232034577855, "adv/ratio_step_to_reasoning": 1.7632658851471894, "adv/std_final_conf": 0.8194170594215393, "adv/std_reasoning": 0.7205320000648499, "adv/std_step_conf": 0.9362738132476807, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5783152682992104, "calib/avg_num_step_conf": 7.3359375, "calib/ece": 0.39963562753036447, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9433198380566802, "calib/gap": 0.04008764886926253, "calib/mean_conf": 0.9599595141700406, "calib/mu_c": 0.9771631205673759, "calib/mu_w": 0.9370754716981133, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3943724696356276, "calib/std_conf": 0.1428866598083284, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9062692307692307, "calib/step_q_c_n": 1040.0, "calib/step_q_gap": 0.00563677253534034, "calib/step_q_w": 0.9006324582338904, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 819.76171875, "completions/mean_terminated_length": 829.4822387695312, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.08853333333333334, "grad_norm": 0.020195307210087776, "kl": 0.155517578125, "learning_rate": 3.2500000000000002e-06, "loss": 0.0086, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01880960538983345, "mask/share_reasoning": 0.8670693635940552, "mask/share_step_conf": 0.10240228474140167, "num_tokens": 25360314.0, "reward": 0.5373420715332031, "reward_std": 0.3253489136695862, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5816448926925659, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1899142861366272, "step": 83 }, { "adv/mean_abs_final_conf": 0.6476809978485107, "adv/mean_abs_reasoning": 0.5205007791519165, "adv/mean_abs_step_conf": 0.7634508013725281, "adv/ratio_final_to_reasoning": 1.2443420332699917, "adv/ratio_step_to_reasoning": 1.4667620721268944, "adv/std_final_conf": 0.8201258182525635, "adv/std_reasoning": 0.7755391597747803, "adv/std_step_conf": 0.9361568093299866, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5499315068493151, "calib/avg_num_step_conf": 7.78125, "calib/ece": 0.37711382113821135, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9715447154471545, "calib/gap": 0.04365616438356179, "calib/mean_conf": 0.9706097560975612, "calib/mu_c": 0.9883561643835618, "calib/mu_w": 0.9447, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37711382113821135, "calib/std_conf": 0.11152688502634846, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9081099656357388, "calib/step_q_c_n": 1164.0, "calib/step_q_gap": -0.001467329050251709, "calib/step_q_w": 0.9095772946859905, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 742.08984375, "completions/mean_terminated_length": 747.9330444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.0896, "grad_norm": 0.020287124440073967, "kl": 0.1703643798828125, "learning_rate": 3.2222222222222227e-06, "loss": 0.0337, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.020629458129405975, "mask/share_reasoning": 0.8547236323356628, "mask/share_step_conf": 0.11683438718318939, "num_tokens": 25656209.0, "reward": 0.5711010694503784, "reward_std": 0.41056394577026367, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6007331609725952, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.2344377040863037, "step": 84 }, { "adv/mean_abs_final_conf": 0.6982238292694092, "adv/mean_abs_reasoning": 0.469634473323822, "adv/mean_abs_step_conf": 0.7610514163970947, "adv/ratio_final_to_reasoning": 1.4867388765729947, "adv/ratio_step_to_reasoning": 1.6205186365702229, "adv/std_final_conf": 0.8545514345169067, "adv/std_reasoning": 0.7393996119499207, "adv/std_step_conf": 0.9361210465431213, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5861647276573242, "calib/avg_num_step_conf": 7.80078125, "calib/ece": 0.4820731707317073, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9634146341463414, "calib/gap": 0.020375462718138504, "calib/mean_conf": 0.9780081300813009, "calib/mu_c": 0.9882786885245902, "calib/mu_w": 0.9679032258064517, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4820731707317073, "calib/std_conf": 0.06663588670766235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9076089266737514, "calib/step_q_c_n": 941.0, "calib/step_q_gap": -0.002088043023218411, "calib/step_q_w": 0.9096969696969698, "calib/step_q_w_n": 1056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 774.8515625, "completions/mean_terminated_length": 790.286865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.09066666666666667, "grad_norm": 0.02755478024482727, "kl": 0.1668548583984375, "learning_rate": 3.1944444444444443e-06, "loss": -0.0236, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019106771796941757, "mask/share_reasoning": 0.8527930378913879, "mask/share_step_conf": 0.1085689514875412, "num_tokens": 25962395.0, "reward": 0.4849882423877716, "reward_std": 0.3687390685081482, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.49901872873306274, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.1842389702796936, "step": 85 }, { "adv/mean_abs_final_conf": 0.6831027269363403, "adv/mean_abs_reasoning": 0.44067060947418213, "adv/mean_abs_step_conf": 0.7617740035057068, "adv/ratio_final_to_reasoning": 1.5501436044292438, "adv/ratio_step_to_reasoning": 1.7286698661720878, "adv/std_final_conf": 0.8571823835372925, "adv/std_reasoning": 0.7014427781105042, "adv/std_step_conf": 0.9361329078674316, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.548220472440945, "calib/avg_num_step_conf": 7.8046875, "calib/ece": 0.4785714285714287, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9722222222222222, "calib/gap": 0.010436535433070881, "calib/mean_conf": 0.9825396825396826, "calib/mu_c": 0.9877165354330708, "calib/mu_w": 0.9772799999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4785714285714287, "calib/std_conf": 0.06547065873698518, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9088988095238096, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.003888708513708594, "calib/step_q_w": 0.905010101010101, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 766.34375, "completions/mean_terminated_length": 775.4308471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 304.0, "epoch": 0.09173333333333333, "grad_norm": 0.024390293285250664, "kl": 0.162445068359375, "learning_rate": 3.1666666666666667e-06, "loss": -0.0033, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019972484558820724, "mask/share_reasoning": 0.8537373542785645, "mask/share_step_conf": 0.11457139253616333, "num_tokens": 26264091.0, "reward": 0.46634215116500854, "reward_std": 0.3684176504611969, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5136839747428894, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.12368777394294739, "step": 86 }, { "adv/mean_abs_final_conf": 0.6265145540237427, "adv/mean_abs_reasoning": 0.4753897190093994, "adv/mean_abs_step_conf": 0.7652829885482788, "adv/ratio_final_to_reasoning": 1.3178967255102865, "adv/ratio_step_to_reasoning": 1.609801301851771, "adv/std_final_conf": 0.8234379291534424, "adv/std_reasoning": 0.7394675612449646, "adv/std_step_conf": 0.936168909072876, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5356804172593645, "calib/avg_num_step_conf": 7.8515625, "calib/ece": 0.28661224489795933, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9836734693877551, "calib/gap": 0.008659712343923043, "calib/mean_conf": 0.9825306122448979, "calib/mu_c": 0.9851461988304095, "calib/mu_w": 0.9764864864864865, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.285591836734694, "calib/std_conf": 0.05303949319517171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9073399390243905, "calib/step_q_c_n": 1312.0, "calib/step_q_gap": -0.008734559542944886, "calib/step_q_w": 0.9160744985673354, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 695.91796875, "completions/mean_terminated_length": 721.2753295898438, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.0928, "grad_norm": 0.016792908310890198, "kl": 0.1767578125, "learning_rate": 3.138888888888889e-06, "loss": -0.1016, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02021612599492073, "mask/share_reasoning": 0.8318778276443481, "mask/share_step_conf": 0.11274976283311844, "num_tokens": 26547742.0, "reward": 0.6892440319061279, "reward_std": 0.40235090255737305, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6785781383514404, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.374909907579422, "step": 87 }, { "adv/mean_abs_final_conf": 0.5426766872406006, "adv/mean_abs_reasoning": 0.371280699968338, "adv/mean_abs_step_conf": 0.7295685410499573, "adv/ratio_final_to_reasoning": 1.461634518807142, "adv/ratio_step_to_reasoning": 1.965005294140453, "adv/std_final_conf": 0.8003331422805786, "adv/std_reasoning": 0.7011435031890869, "adv/std_step_conf": 0.9352799654006958, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5501409443269908, "calib/avg_num_step_conf": 7.203125, "calib/ece": 0.3319521912350598, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0030380549682875735, "calib/mean_conf": 0.9893227091633466, "calib/mu_c": 0.9903636363636362, "calib/mu_w": 0.9873255813953487, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3319521912350598, "calib/std_conf": 0.010929848848345757, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9066103059581321, "calib/step_q_c_n": 1242.0, "calib/step_q_gap": 0.005314624895009112, "calib/step_q_w": 0.901295681063123, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 754.734375, "completions/mean_terminated_length": 760.6771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.09386666666666667, "grad_norm": 0.014357146807014942, "kl": 0.1505279541015625, "learning_rate": 3.1111111111111116e-06, "loss": 0.0539, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019829383119940758, "mask/share_reasoning": 0.8680585026741028, "mask/share_step_conf": 0.1042996495962143, "num_tokens": 26850802.0, "reward": 0.6862862706184387, "reward_std": 0.31181150674819946, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6488343477249146, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3995193541049957, "step": 88 }, { "adv/mean_abs_final_conf": 0.5844972133636475, "adv/mean_abs_reasoning": 0.4211689233779907, "adv/mean_abs_step_conf": 0.7886731624603271, "adv/ratio_final_to_reasoning": 1.3877975817296302, "adv/ratio_step_to_reasoning": 1.8725815668800156, "adv/std_final_conf": 0.7998574376106262, "adv/std_reasoning": 0.7013557553291321, "adv/std_step_conf": 0.9361746907234192, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5437568069703377, "calib/avg_num_step_conf": 7.62109375, "calib/ece": 0.46563999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": 0.014389775129732829, "calib/mean_conf": 0.9816400000000001, "calib/mu_c": 0.9886046511627907, "calib/mu_w": 0.9742148760330579, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46563999999999994, "calib/std_conf": 0.06436855132749222, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9093837972166999, "calib/step_q_c_n": 1006.0, "calib/step_q_gap": 0.010695966528869283, "calib/step_q_w": 0.8986878306878306, "calib/step_q_w_n": 945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 836.5859375, "completions/mean_terminated_length": 839.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.09493333333333333, "grad_norm": 0.020895086228847504, "kl": 0.14752197265625, "learning_rate": 3.0833333333333336e-06, "loss": 0.0347, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018971944227814674, "mask/share_reasoning": 0.8709502220153809, "mask/share_step_conf": 0.10617159307003021, "num_tokens": 27173856.0, "reward": 0.5042704939842224, "reward_std": 0.335740864276886, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5199999809265137, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.19322849810123444, "step": 89 }, { "adv/mean_abs_final_conf": 0.6049848794937134, "adv/mean_abs_reasoning": 0.4883537292480469, "adv/mean_abs_step_conf": 0.7501746416091919, "adv/ratio_final_to_reasoning": 1.2388251450956498, "adv/ratio_step_to_reasoning": 1.5361296467711825, "adv/std_final_conf": 0.8281879425048828, "adv/std_reasoning": 0.7753020524978638, "adv/std_step_conf": 0.9353448152542114, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5343933875035025, "calib/avg_num_step_conf": 7.7421875, "calib/ece": 0.33662698412698394, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": -0.0034617539927148666, "calib/mean_conf": 0.9802777777777779, "calib/mu_c": 0.9790963855421685, "calib/mu_w": 0.9825581395348834, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32908730158730143, "calib/std_conf": 0.08621863486646218, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9016487730061351, "calib/step_q_c_n": 1304.0, "calib/step_q_gap": -0.0054898700617114216, "calib/step_q_w": 0.9071386430678465, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 760.1171875, "completions/mean_terminated_length": 766.1023559570312, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.096, "grad_norm": 0.020056363195180893, "kl": 0.157623291015625, "learning_rate": 3.055555555555556e-06, "loss": 0.0471, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02005939930677414, "mask/share_reasoning": 0.8588818311691284, "mask/share_step_conf": 0.11324631422758102, "num_tokens": 27471766.0, "reward": 0.659142255783081, "reward_std": 0.35710757970809937, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6485495567321777, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3439536690711975, "step": 90 }, { "adv/mean_abs_final_conf": 0.6580387353897095, "adv/mean_abs_reasoning": 0.5622403621673584, "adv/mean_abs_step_conf": 0.7587998509407043, "adv/ratio_final_to_reasoning": 1.1703868659536674, "adv/ratio_step_to_reasoning": 1.349600459162406, "adv/std_final_conf": 0.8453028202056885, "adv/std_reasoning": 0.8099509477615356, "adv/std_step_conf": 0.9357338547706604, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5231911599099099, "calib/avg_num_step_conf": 7.49609375, "calib/ece": 0.37295081967213106, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9795081967213115, "calib/gap": 0.02687218468468444, "calib/mean_conf": 0.9782786885245903, "calib/mu_c": 0.9888513513513512, "calib/mu_w": 0.9619791666666667, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3723360655737704, "calib/std_conf": 0.08907716881916157, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9064558139534884, "calib/step_q_c_n": 1075.0, "calib/step_q_gap": -0.0003095888901135657, "calib/step_q_w": 0.906765402843602, "calib/step_q_w_n": 844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 837.46875, "completions/mean_terminated_length": 850.761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.09706666666666666, "grad_norm": 0.013803146779537201, "kl": 0.146087646484375, "learning_rate": 3.0277777777777776e-06, "loss": 0.0097, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01792112924158573, "mask/share_reasoning": 0.867091178894043, "mask/share_step_conf": 0.09936273097991943, "num_tokens": 27793870.0, "reward": 0.5871576070785522, "reward_std": 0.4421505331993103, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5986281037330627, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.26943719387054443, "step": 91 }, { "adv/mean_abs_final_conf": 0.5850780010223389, "adv/mean_abs_reasoning": 0.413005530834198, "adv/mean_abs_step_conf": 0.7297500371932983, "adv/ratio_final_to_reasoning": 1.4166347841409894, "adv/ratio_step_to_reasoning": 1.766925580195821, "adv/std_final_conf": 0.8069612979888916, "adv/std_reasoning": 0.7204309105873108, "adv/std_step_conf": 0.9356918334960938, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5852906050955414, "calib/avg_num_step_conf": 8.05078125, "calib/ece": 0.3537351778656127, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9683794466403162, "calib/gap": 0.03983114384288711, "calib/mean_conf": 0.973102766798419, "calib/mu_c": 0.988216560509554, "calib/mu_w": 0.9483854166666669, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35314229249011864, "calib/std_conf": 0.1035456273768553, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9083766233766233, "calib/step_q_c_n": 1232.0, "calib/step_q_gap": 0.0006444158977331194, "calib/step_q_w": 0.9077322074788902, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 708.21875, "completions/mean_terminated_length": 713.7952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.09813333333333334, "grad_norm": 0.016659297049045563, "kl": 0.17535400390625, "learning_rate": 3e-06, "loss": 0.0304, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021326087415218353, "mask/share_reasoning": 0.8476418852806091, "mask/share_step_conf": 0.12321953475475311, "num_tokens": 28081894.0, "reward": 0.6601039171218872, "reward_std": 0.32772600650787354, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6406811475753784, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3592141270637512, "step": 92 }, { "adv/mean_abs_final_conf": 0.6216708421707153, "adv/mean_abs_reasoning": 0.5425342321395874, "adv/mean_abs_step_conf": 0.7974840402603149, "adv/ratio_final_to_reasoning": 1.1458647313719497, "adv/ratio_step_to_reasoning": 1.469923910819939, "adv/std_final_conf": 0.8077294230461121, "adv/std_reasoning": 0.7755074501037598, "adv/std_step_conf": 0.9356343746185303, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5152945870461498, "calib/avg_num_step_conf": 7.70703125, "calib/ece": 0.39759036144578325, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.01350113046947743, "calib/mean_conf": 0.9839357429718877, "calib/mu_c": 0.9895205479452055, "calib/mu_w": 0.9760194174757281, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39759036144578325, "calib/std_conf": 0.06190233828775913, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9072117962466488, "calib/step_q_c_n": 1119.0, "calib/step_q_gap": 0.007246925052269537, "calib/step_q_w": 0.8999648711943793, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 769.86328125, "completions/mean_terminated_length": 775.9251708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0992, "grad_norm": 0.013393659144639969, "kl": 0.1565093994140625, "learning_rate": 2.9722222222222225e-06, "loss": 0.026, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019578615203499794, "mask/share_reasoning": 0.8596739768981934, "mask/share_step_conf": 0.1129348874092102, "num_tokens": 28384755.0, "reward": 0.5730220079421997, "reward_std": 0.4286647140979767, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.585631251335144, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.251818984746933, "step": 93 }, { "adv/mean_abs_final_conf": 0.631345808506012, "adv/mean_abs_reasoning": 0.504591703414917, "adv/mean_abs_step_conf": 0.7707089781761169, "adv/ratio_final_to_reasoning": 1.2512013262074333, "adv/ratio_step_to_reasoning": 1.5273913006500155, "adv/std_final_conf": 0.8334376811981201, "adv/std_reasoning": 0.7754099369049072, "adv/std_step_conf": 0.9354292750358582, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5365646258503401, "calib/avg_num_step_conf": 7.4921875, "calib/ece": 0.41232653061224495, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9795918367346939, "calib/gap": 0.013904761904761642, "calib/mean_conf": 0.9837551020408163, "calib/mu_c": 0.9897142857142855, "calib/mu_w": 0.9758095238095239, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41232653061224495, "calib/std_conf": 0.057645833030722286, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9093100189035918, "calib/step_q_c_n": 1058.0, "calib/step_q_gap": 0.011997228205917199, "calib/step_q_w": 0.8973127906976746, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 762.890625, "completions/mean_terminated_length": 775.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.10026666666666667, "grad_norm": 0.026255443692207336, "kl": 0.170745849609375, "learning_rate": 2.944444444444445e-06, "loss": 0.0065, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019794750958681107, "mask/share_reasoning": 0.8554838299751282, "mask/share_step_conf": 0.10909644514322281, "num_tokens": 28688735.0, "reward": 0.5537281036376953, "reward_std": 0.37324994802474976, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5632859468460083, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.24417030811309814, "step": 94 }, { "adv/mean_abs_final_conf": 0.5990731120109558, "adv/mean_abs_reasoning": 0.49023786187171936, "adv/mean_abs_step_conf": 0.7478466033935547, "adv/ratio_final_to_reasoning": 1.2220049869744973, "adv/ratio_step_to_reasoning": 1.5254770419777244, "adv/std_final_conf": 0.8059893250465393, "adv/std_reasoning": 0.7575047016143799, "adv/std_step_conf": 0.9361989498138428, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.534026402640264, "calib/avg_num_step_conf": 7.7265625, "calib/ece": 0.37441832669322705, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": 0.043321386138613915, "calib/mean_conf": 0.9720278884462153, "calib/mu_c": 0.9894599999999999, "calib/mu_w": 0.946138613861386, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37441832669322705, "calib/std_conf": 0.11924745446982661, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9066677908937605, "calib/step_q_c_n": 1186.0, "calib/step_q_gap": 0.006187992913962437, "calib/step_q_w": 0.900479797979798, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 769.3359375, "completions/mean_terminated_length": 775.3936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 362.0, "epoch": 0.10133333333333333, "grad_norm": 0.012575649656355381, "kl": 0.1600494384765625, "learning_rate": 2.916666666666667e-06, "loss": -0.0173, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01988060027360916, "mask/share_reasoning": 0.8578656911849976, "mask/share_step_conf": 0.11444119364023209, "num_tokens": 28991813.0, "reward": 0.5770303010940552, "reward_std": 0.41374671459198, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6137281060218811, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2270512878894806, "step": 95 }, { "adv/mean_abs_final_conf": 0.594207763671875, "adv/mean_abs_reasoning": 0.4591464400291443, "adv/mean_abs_step_conf": 0.7937597036361694, "adv/ratio_final_to_reasoning": 1.2941574013601362, "adv/ratio_step_to_reasoning": 1.7287724229894619, "adv/std_final_conf": 0.8037899732589722, "adv/std_reasoning": 0.7206980586051941, "adv/std_step_conf": 0.9362131953239441, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4961742424242424, "calib/avg_num_step_conf": 7.37109375, "calib/ece": 0.2850199203187249, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.013834848484848616, "calib/mean_conf": 0.9814342629482073, "calib/mu_c": 0.9855681818181818, "calib/mu_w": 0.9717333333333332, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28262948207171296, "calib/std_conf": 0.06363127301674591, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.906852986217458, "calib/step_q_c_n": 1306.0, "calib/step_q_gap": 0.005803072275977872, "calib/step_q_w": 0.9010499139414802, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 717.26171875, "completions/mean_terminated_length": 722.909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.1024, "grad_norm": 0.02010061964392662, "kl": 0.1659393310546875, "learning_rate": 2.888888888888889e-06, "loss": 0.0107, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020429600030183792, "mask/share_reasoning": 0.8600814342498779, "mask/share_step_conf": 0.11167645454406738, "num_tokens": 29281248.0, "reward": 0.7246764898300171, "reward_std": 0.400970995426178, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6997547149658203, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.4160044193267822, "step": 96 }, { "adv/mean_abs_final_conf": 0.6796474456787109, "adv/mean_abs_reasoning": 0.5447673797607422, "adv/mean_abs_step_conf": 0.7568874359130859, "adv/ratio_final_to_reasoning": 1.2475920382332861, "adv/ratio_step_to_reasoning": 1.3893773086147436, "adv/std_final_conf": 0.8596211671829224, "adv/std_reasoning": 0.792809247970581, "adv/std_step_conf": 0.9363095760345459, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5188064859117492, "calib/avg_num_step_conf": 8.1484375, "calib/ece": 0.38115537848605596, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.0034456406166932796, "calib/mean_conf": 0.9867330677290838, "calib/mu_c": 0.9880921052631578, "calib/mu_w": 0.9846464646464645, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38115537848605596, "calib/std_conf": 0.012452477879124065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9106488240064882, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": 0.003204509821259527, "calib/step_q_w": 0.9074443141852286, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 768.609375, "completions/mean_terminated_length": 768.609375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.10346666666666667, "grad_norm": 0.02059132792055607, "kl": 0.157989501953125, "learning_rate": 2.861111111111111e-06, "loss": 0.0107, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019798683002591133, "mask/share_reasoning": 0.8629213571548462, "mask/share_step_conf": 0.1172800213098526, "num_tokens": 29583084.0, "reward": 0.5847292542457581, "reward_std": 0.42039793729782104, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6053003668785095, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.24931436777114868, "step": 97 }, { "adv/mean_abs_final_conf": 0.5588270425796509, "adv/mean_abs_reasoning": 0.44539546966552734, "adv/mean_abs_step_conf": 0.7786312699317932, "adv/ratio_final_to_reasoning": 1.2546760814592608, "adv/ratio_step_to_reasoning": 1.7481795908622768, "adv/std_final_conf": 0.7846071720123291, "adv/std_reasoning": 0.7393063902854919, "adv/std_step_conf": 0.9350544214248657, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5123342175066312, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.4010682730923695, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9839357429718876, "calib/gap": 0.012817904509283773, "calib/mean_conf": 0.9817911646586346, "calib/mu_c": 0.9871448275862069, "calib/mu_w": 0.9743269230769231, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40026506024096387, "calib/std_conf": 0.06544126219319697, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075486725663718, "calib/step_q_c_n": 1130.0, "calib/step_q_gap": -0.001924408762379648, "calib/step_q_w": 0.9094730813287515, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 788.95703125, "completions/mean_terminated_length": 788.95703125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.10453333333333334, "grad_norm": 0.014475611969828606, "kl": 0.1655731201171875, "learning_rate": 2.8333333333333335e-06, "loss": 0.1126, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01985691487789154, "mask/share_reasoning": 0.8670278787612915, "mask/share_step_conf": 0.11311522126197815, "num_tokens": 29891241.0, "reward": 0.5559430122375488, "reward_std": 0.37032419443130493, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5827776193618774, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.22129589319229126, "step": 98 }, { "adv/mean_abs_final_conf": 0.6716063618659973, "adv/mean_abs_reasoning": 0.5517687797546387, "adv/mean_abs_step_conf": 0.7963594198226929, "adv/ratio_final_to_reasoning": 1.2171880441743155, "adv/ratio_step_to_reasoning": 1.44328466749572, "adv/std_final_conf": 0.8631746768951416, "adv/std_reasoning": 0.7929278016090393, "adv/std_step_conf": 0.9363625645637512, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5258838383838383, "calib/avg_num_step_conf": 8.015625, "calib/ece": 0.5740329218106998, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9753086419753086, "calib/gap": 0.010372474747474714, "calib/mean_conf": 0.9752674897119343, "calib/mu_c": 0.9814141414141415, "calib/mu_w": 0.9710416666666668, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5709465020576133, "calib/std_conf": 0.09517273905879559, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9105185185185185, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.016436392914653708, "calib/step_q_w": 0.8940821256038648, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 832.1796875, "completions/mean_terminated_length": 848.7570190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.1056, "grad_norm": 0.0244298093020916, "kl": 0.1494598388671875, "learning_rate": 2.805555555555556e-06, "loss": -0.0562, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017815835773944855, "mask/share_reasoning": 0.8580272197723389, "mask/share_step_conf": 0.10462567210197449, "num_tokens": 30210079.0, "reward": 0.2601035237312317, "reward_std": 0.4317253828048706, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.41011834144592285, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": -0.15788006782531738, "step": 99 }, { "adv/mean_abs_final_conf": 0.6238076686859131, "adv/mean_abs_reasoning": 0.45149117708206177, "adv/mean_abs_step_conf": 0.7373534440994263, "adv/ratio_final_to_reasoning": 1.3816608172002696, "adv/ratio_step_to_reasoning": 1.6331513914953137, "adv/std_final_conf": 0.8372292518615723, "adv/std_reasoning": 0.7392777800559998, "adv/std_step_conf": 0.9361663460731506, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5089066339066339, "calib/avg_num_step_conf": 7.625, "calib/ece": 0.38210526315789495, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9595141700404858, "calib/gap": 0.022002457002456954, "calib/mean_conf": 0.965910931174089, "calib/mu_c": 0.9747297297297297, "calib/mu_w": 0.9527272727272728, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3744129554655872, "calib/std_conf": 0.12706226024148723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9030574912891987, "calib/step_q_c_n": 1148.0, "calib/step_q_gap": -0.001096737566522843, "calib/step_q_w": 0.9041542288557215, "calib/step_q_w_n": 804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 774.9375, "completions/mean_terminated_length": 787.2381591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.10666666666666667, "grad_norm": 0.019371310248970985, "kl": 0.1641845703125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0149, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019257165491580963, "mask/share_reasoning": 0.8558045029640198, "mask/share_step_conf": 0.10931334644556046, "num_tokens": 30515871.0, "reward": 0.5841464400291443, "reward_std": 0.3418727517127991, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.597989022731781, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2601475417613983, "step": 100 }, { "adv/mean_abs_final_conf": 0.6889234781265259, "adv/mean_abs_reasoning": 0.6015197038650513, "adv/mean_abs_step_conf": 0.7753009796142578, "adv/ratio_final_to_reasoning": 1.1453049230139323, "adv/ratio_step_to_reasoning": 1.2889037127671443, "adv/std_final_conf": 0.8938189744949341, "adv/std_reasoning": 0.859066903591156, "adv/std_step_conf": 0.9366356730461121, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5376422934562469, "calib/avg_num_step_conf": 8.26953125, "calib/ece": 0.5155416666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9708333333333333, "calib/gap": 0.012023884349465663, "calib/mean_conf": 0.9780416666666667, "calib/mu_c": 0.9845045045045044, "calib/mu_w": 0.9724806201550388, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5155416666666667, "calib/std_conf": 0.07245629669363152, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.891039387308534, "calib/step_q_c_n": 914.0, "calib/step_q_gap": -0.01196144394666121, "calib/step_q_w": 0.9030008312551953, "calib/step_q_w_n": 1203.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 840.95703125, "completions/mean_terminated_length": 861.1400146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.10773333333333333, "grad_norm": 0.01625613309442997, "kl": 0.1616668701171875, "learning_rate": 2.7500000000000004e-06, "loss": -0.0316, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01756630465388298, "mask/share_reasoning": 0.8519110083580017, "mask/share_step_conf": 0.1070852130651474, "num_tokens": 30838148.0, "reward": 0.41306352615356445, "reward_std": 0.47633716464042664, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.45595428347587585, "rewards/format_reward_step": 0.9375, "rewards/step_correlation_reward": 0.09517276287078857, "step": 101 }, { "adv/mean_abs_final_conf": 0.564067542552948, "adv/mean_abs_reasoning": 0.3646267056465149, "adv/mean_abs_step_conf": 0.7716248631477356, "adv/ratio_final_to_reasoning": 1.546972653999128, "adv/ratio_step_to_reasoning": 2.116205015152627, "adv/std_final_conf": 0.7669880986213684, "adv/std_reasoning": 0.640241801738739, "adv/std_step_conf": 0.9355287551879883, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5437934458788481, "calib/avg_num_step_conf": 8.2890625, "calib/ece": 0.3619685039370081, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9763779527559056, "calib/gap": 0.0031658391261171204, "calib/mean_conf": 0.9769291338582676, "calib/mu_c": 0.9781132075471697, "calib/mu_w": 0.9749473684210526, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.356456692913386, "calib/std_conf": 0.07518493536306997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9006338553318419, "calib/step_q_c_n": 1341.0, "calib/step_q_gap": -0.005704172837172283, "calib/step_q_w": 0.9063380281690142, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 710.234375, "completions/mean_terminated_length": 713.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.1088, "grad_norm": 0.013416235335171223, "kl": 0.192138671875, "learning_rate": 2.7222222222222224e-06, "loss": 0.0266, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021066762506961823, "mask/share_reasoning": 0.8479097485542297, "mask/share_step_conf": 0.12711723148822784, "num_tokens": 31126664.0, "reward": 0.6091418862342834, "reward_std": 0.3119138479232788, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.633550763130188, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.2620766758918762, "step": 102 }, { "adv/mean_abs_final_conf": 0.6057738065719604, "adv/mean_abs_reasoning": 0.460005521774292, "adv/mean_abs_step_conf": 0.7314463257789612, "adv/ratio_final_to_reasoning": 1.3168837718195732, "adv/ratio_step_to_reasoning": 1.5900816211024862, "adv/std_final_conf": 0.790691077709198, "adv/std_reasoning": 0.7393838763237, "adv/std_step_conf": 0.9350913166999817, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5302175956047134, "calib/avg_num_step_conf": 7.69140625, "calib/ece": 0.33369918699187007, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.975609756097561, "calib/gap": 0.013934070700499235, "calib/mean_conf": 0.9800406504065042, "calib/mu_c": 0.9849685534591196, "calib/mu_w": 0.9710344827586204, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33369918699187007, "calib/std_conf": 0.0657322603815016, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9058534621578099, "calib/step_q_c_n": 1242.0, "calib/step_q_gap": -0.001973222849067535, "calib/step_q_w": 0.9078266850068775, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 804.5546875, "completions/mean_terminated_length": 820.5817260742188, "completions/min_length": 0.0, "completions/min_terminated_length": 64.0, "epoch": 0.10986666666666667, "grad_norm": 0.019397255033254623, "kl": 0.150115966796875, "learning_rate": 2.6944444444444444e-06, "loss": -0.0785, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01852298527956009, "mask/share_reasoning": 0.8557056188583374, "mask/share_step_conf": 0.1062401756644249, "num_tokens": 31437182.0, "reward": 0.6300480365753174, "reward_std": 0.3218870460987091, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6362464427947998, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.30666205286979675, "step": 103 }, { "adv/mean_abs_final_conf": 0.682067334651947, "adv/mean_abs_reasoning": 0.4738949239253998, "adv/mean_abs_step_conf": 0.7475656867027283, "adv/ratio_final_to_reasoning": 1.4392796804029866, "adv/ratio_step_to_reasoning": 1.5774924966708646, "adv/std_final_conf": 0.8657509684562683, "adv/std_reasoning": 0.7575639486312866, "adv/std_step_conf": 0.9358437061309814, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5349270612428507, "calib/avg_num_step_conf": 8.19140625, "calib/ece": 0.43314, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.948, "calib/gap": 0.03745549771865553, "calib/mean_conf": 0.9651400000000001, "calib/mu_c": 0.9826691729323308, "calib/mu_w": 0.9452136752136753, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43314, "calib/std_conf": 0.11107871263207905, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9072242990654206, "calib/step_q_c_n": 1070.0, "calib/step_q_gap": 0.0011580867966766162, "calib/step_q_w": 0.906066212268744, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 779.765625, "completions/mean_terminated_length": 779.765625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.11093333333333333, "grad_norm": 0.02324623055756092, "kl": 0.1611480712890625, "learning_rate": 2.666666666666667e-06, "loss": 0.0021, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019226912409067154, "mask/share_reasoning": 0.8651635646820068, "mask/share_step_conf": 0.1156095415353775, "num_tokens": 31743482.0, "reward": 0.5363671779632568, "reward_std": 0.3894505500793457, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5563733577728271, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2171422243118286, "step": 104 }, { "adv/mean_abs_final_conf": 0.6646937727928162, "adv/mean_abs_reasoning": 0.5343684554100037, "adv/mean_abs_step_conf": 0.7638580799102783, "adv/ratio_final_to_reasoning": 1.2438866217932307, "adv/ratio_step_to_reasoning": 1.429459527741387, "adv/std_final_conf": 0.8703509569168091, "adv/std_reasoning": 0.7928860783576965, "adv/std_step_conf": 0.9356840252876282, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6223418306696605, "calib/avg_num_step_conf": 8.79296875, "calib/ece": 0.4010121457489878, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9109311740890689, "calib/gap": 0.09539426760005276, "calib/mean_conf": 0.9435222672064778, "calib/mu_c": 0.9871641791044776, "calib/mu_w": 0.8917699115044249, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4010121457489878, "calib/std_conf": 0.1719056754300515, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9093078324225865, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": 0.010279211433861457, "calib/step_q_w": 0.899028620988725, "calib/step_q_w_n": 1153.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 807.8515625, "completions/mean_terminated_length": 820.6746215820312, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.112, "grad_norm": 0.021434539929032326, "kl": 0.152801513671875, "learning_rate": 2.6388888888888893e-06, "loss": -0.0398, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018452614545822144, "mask/share_reasoning": 0.8555814027786255, "mask/share_step_conf": 0.11034099757671356, "num_tokens": 32056052.0, "reward": 0.5344028472900391, "reward_std": 0.4172634780406952, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.587394118309021, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1829739362001419, "step": 105 }, { "adv/mean_abs_final_conf": 0.6361554861068726, "adv/mean_abs_reasoning": 0.4690241813659668, "adv/mean_abs_step_conf": 0.769987940788269, "adv/ratio_final_to_reasoning": 1.3563383539291287, "adv/ratio_step_to_reasoning": 1.641680688074094, "adv/std_final_conf": 0.8325013518333435, "adv/std_reasoning": 0.7576159238815308, "adv/std_step_conf": 0.9357642531394958, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5327891156462585, "calib/avg_num_step_conf": 7.98046875, "calib/ece": 0.39783673469387776, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9551020408163265, "calib/gap": 0.02766666666666684, "calib/mean_conf": 0.9660000000000001, "calib/mu_c": 0.9778571428571431, "calib/mu_w": 0.9501904761904763, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39620408163265325, "calib/std_conf": 0.10906429483640953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9052833638025595, "calib/step_q_c_n": 1094.0, "calib/step_q_gap": -0.0038314939424353156, "calib/step_q_w": 0.9091148577449948, "calib/step_q_w_n": 949.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 743.40234375, "completions/mean_terminated_length": 758.211181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.11306666666666666, "grad_norm": 0.0160527266561985, "kl": 0.16754150390625, "learning_rate": 2.6111111111111113e-06, "loss": -0.0214, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01931002363562584, "mask/share_reasoning": 0.850881814956665, "mask/share_step_conf": 0.1102769672870636, "num_tokens": 32350947.0, "reward": 0.5429900288581848, "reward_std": 0.348480224609375, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.575013279914856, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.21018551290035248, "step": 106 }, { "adv/mean_abs_final_conf": 0.6448113322257996, "adv/mean_abs_reasoning": 0.5004734992980957, "adv/mean_abs_step_conf": 0.7651152014732361, "adv/ratio_final_to_reasoning": 1.288402549046323, "adv/ratio_step_to_reasoning": 1.5287826479250055, "adv/std_final_conf": 0.8311536908149719, "adv/std_reasoning": 0.7575410604476929, "adv/std_step_conf": 0.9356609582901001, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5640840220385676, "calib/avg_num_step_conf": 8.85546875, "calib/ece": 0.3256521739130436, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9683794466403162, "calib/gap": 0.028325757575757615, "calib/mean_conf": 0.9742687747035573, "calib/mu_c": 0.9841212121212123, "calib/mu_w": 0.9557954545454547, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32387351778656137, "calib/std_conf": 0.07904043402431196, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9090238611713665, "calib/step_q_c_n": 1383.0, "calib/step_q_gap": -0.011258944258497938, "calib/step_q_w": 0.9202828054298644, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 729.65234375, "completions/mean_terminated_length": 738.3043823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.11413333333333334, "grad_norm": 0.013503472320735455, "kl": 0.173858642578125, "learning_rate": 2.5833333333333337e-06, "loss": 0.0111, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02020452544093132, "mask/share_reasoning": 0.840317964553833, "mask/share_step_conf": 0.12775877118110657, "num_tokens": 32642354.0, "reward": 0.6801269054412842, "reward_std": 0.3871152102947235, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6680933237075806, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3655979335308075, "step": 107 }, { "adv/mean_abs_final_conf": 0.6603453755378723, "adv/mean_abs_reasoning": 0.5115096569061279, "adv/mean_abs_step_conf": 0.7684875726699829, "adv/ratio_final_to_reasoning": 1.2909734285995282, "adv/ratio_step_to_reasoning": 1.5023911323946235, "adv/std_final_conf": 0.8369495272636414, "adv/std_reasoning": 0.7754160761833191, "adv/std_step_conf": 0.9356595277786255, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5251436781609196, "calib/avg_num_step_conf": 7.94140625, "calib/ece": 0.26373983739837403, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9512195121951219, "calib/gap": 0.031005747126436667, "calib/mean_conf": 0.9694308943089431, "calib/mu_c": 0.9785057471264367, "calib/mu_w": 0.9475, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2629268292682927, "calib/std_conf": 0.07611289019392023, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9059385903698535, "calib/step_q_c_n": 1433.0, "calib/step_q_gap": 0.005038590369853546, "calib/step_q_w": 0.9008999999999999, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 792.09765625, "completions/mean_terminated_length": 798.3346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.1152, "grad_norm": 0.01923747919499874, "kl": 0.1485748291015625, "learning_rate": 2.5555555555555557e-06, "loss": 0.0034, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0191725455224514, "mask/share_reasoning": 0.85679030418396, "mask/share_step_conf": 0.11622464656829834, "num_tokens": 32948363.0, "reward": 0.7075442671775818, "reward_std": 0.37582361698150635, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7027539014816284, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.3834283649921417, "step": 108 }, { "adv/mean_abs_final_conf": 0.62883460521698, "adv/mean_abs_reasoning": 0.36270871758461, "adv/mean_abs_step_conf": 0.7444422245025635, "adv/ratio_final_to_reasoning": 1.733717924963549, "adv/ratio_step_to_reasoning": 2.0524519770576113, "adv/std_final_conf": 0.8395936489105225, "adv/std_reasoning": 0.6612196564674377, "adv/std_step_conf": 0.9360567927360535, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6210819672131147, "calib/avg_num_step_conf": 8.47265625, "calib/ece": 0.44287449392712563, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8785425101214575, "calib/gap": 0.058613770491803274, "calib/mean_conf": 0.9440890688259109, "calib/mu_c": 0.9730400000000001, "calib/mu_w": 0.9144262295081969, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4404453441295548, "calib/std_conf": 0.1363487140527158, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.905908658420552, "calib/step_q_c_n": 1051.0, "calib/step_q_gap": 0.015935492052036815, "calib/step_q_w": 0.8899731663685152, "calib/step_q_w_n": 1118.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 788.91015625, "completions/mean_terminated_length": 804.6255493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.11626666666666667, "grad_norm": 0.020503204315900803, "kl": 0.163543701171875, "learning_rate": 2.5277777777777778e-06, "loss": -0.0104, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01863759383559227, "mask/share_reasoning": 0.8506479263305664, "mask/share_step_conf": 0.11118321120738983, "num_tokens": 33254924.0, "reward": 0.4792168438434601, "reward_std": 0.3232100009918213, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.548890233039856, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1181371733546257, "step": 109 }, { "adv/mean_abs_final_conf": 0.6838954091072083, "adv/mean_abs_reasoning": 0.4938781261444092, "adv/mean_abs_step_conf": 0.7572989463806152, "adv/ratio_final_to_reasoning": 1.3847452901917716, "adv/ratio_step_to_reasoning": 1.5333721140733052, "adv/std_final_conf": 0.8846848607063293, "adv/std_reasoning": 0.7575966119766235, "adv/std_step_conf": 0.9364821910858154, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5694776714513556, "calib/avg_num_step_conf": 8.734375, "calib/ece": 0.37115537848605595, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9243027888446215, "calib/gap": 0.01622541201488581, "calib/mean_conf": 0.965179282868526, "calib/mu_c": 0.9715789473684211, "calib/mu_w": 0.9553535353535353, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36537848605577705, "calib/std_conf": 0.08533194925016266, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9116350123864575, "calib/step_q_c_n": 1211.0, "calib/step_q_gap": -0.005994255906225487, "calib/step_q_w": 0.917629268292683, "calib/step_q_w_n": 1025.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 742.86328125, "completions/mean_terminated_length": 748.7125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.11733333333333333, "grad_norm": 0.01990644447505474, "kl": 0.1675872802734375, "learning_rate": 2.5e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020663520321249962, "mask/share_reasoning": 0.8460651636123657, "mask/share_step_conf": 0.12545883655548096, "num_tokens": 33550017.0, "reward": 0.6041939854621887, "reward_std": 0.40416789054870605, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6198753714561462, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.27445003390312195, "step": 110 }, { "adv/mean_abs_final_conf": 0.7322089076042175, "adv/mean_abs_reasoning": 0.5857294797897339, "adv/mean_abs_step_conf": 0.7790755033493042, "adv/ratio_final_to_reasoning": 1.2500803406157175, "adv/ratio_step_to_reasoning": 1.33009440403952, "adv/std_final_conf": 0.8689647912979126, "adv/std_reasoning": 0.8266346454620361, "adv/std_step_conf": 0.9363865256309509, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6608518047277842, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.3606910569105691, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8252032520325203, "calib/gap": 0.10853746735418202, "calib/mean_conf": 0.9176016260162603, "calib/mu_c": 0.9656934306569344, "calib/mu_w": 0.8571559633027523, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3606910569105691, "calib/std_conf": 0.18651581767040165, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.906980244590781, "calib/step_q_c_n": 1063.0, "calib/step_q_gap": 0.006221697582233898, "calib/step_q_w": 0.9007585470085471, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 799.48828125, "completions/mean_terminated_length": 808.9683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.1184, "grad_norm": 0.027693284675478935, "kl": 0.1625518798828125, "learning_rate": 2.4722222222222226e-06, "loss": 0.0956, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01975979655981064, "mask/share_reasoning": 0.8559533357620239, "mask/share_step_conf": 0.11256811767816544, "num_tokens": 33862094.0, "reward": 0.5763823986053467, "reward_std": 0.4253450036048889, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6168433427810669, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.23592141270637512, "step": 111 }, { "adv/mean_abs_final_conf": 0.7116504311561584, "adv/mean_abs_reasoning": 0.5469608306884766, "adv/mean_abs_step_conf": 0.7545567154884338, "adv/ratio_final_to_reasoning": 1.3010994411800603, "adv/ratio_step_to_reasoning": 1.3795443350827332, "adv/std_final_conf": 0.8786104917526245, "adv/std_reasoning": 0.7929294109344482, "adv/std_step_conf": 0.9363153576850891, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6716371863430688, "calib/avg_num_step_conf": 7.8203125, "calib/ece": 0.32983673469387764, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7877551020408163, "calib/gap": 0.12024818318935948, "calib/mean_conf": 0.8827346938775512, "calib/mu_c": 0.9327972027972027, "calib/mu_w": 0.8125490196078432, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3144489795918368, "calib/std_conf": 0.23465709028457812, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8976138279932547, "calib/step_q_c_n": 1186.0, "calib/step_q_gap": 0.00026088681678415604, "calib/step_q_w": 0.8973529411764706, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 798.03125, "completions/mean_terminated_length": 813.9282836914062, "completions/min_length": 0.0, "completions/min_terminated_length": 395.0, "epoch": 0.11946666666666667, "grad_norm": 0.02171127125620842, "kl": 0.1656036376953125, "learning_rate": 2.4444444444444447e-06, "loss": -0.0284, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018730318173766136, "mask/share_reasoning": 0.8514347076416016, "mask/share_step_conf": 0.11030372977256775, "num_tokens": 34174310.0, "reward": 0.5774442553520203, "reward_std": 0.38559865951538086, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6421105265617371, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.20965296030044556, "step": 112 }, { "adv/mean_abs_final_conf": 0.7501026391983032, "adv/mean_abs_reasoning": 0.5847432017326355, "adv/mean_abs_step_conf": 0.7957204580307007, "adv/ratio_final_to_reasoning": 1.2827898417214532, "adv/ratio_step_to_reasoning": 1.3608032648740929, "adv/std_final_conf": 0.8902297019958496, "adv/std_reasoning": 0.8099881410598755, "adv/std_step_conf": 0.9362003803253174, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5998031763268632, "calib/avg_num_step_conf": 8.3046875, "calib/ece": 0.37061224489795924, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8163265306122449, "calib/gap": 0.06876611918012743, "calib/mean_conf": 0.9269387755102042, "calib/mu_c": 0.9566906474820143, "calib/mu_w": 0.8879245283018868, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3651020408163266, "calib/std_conf": 0.15780385760827156, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9058223394898858, "calib/step_q_c_n": 1137.0, "calib/step_q_gap": 0.022546303089481246, "calib/step_q_w": 0.8832760364004045, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 722.87890625, "completions/mean_terminated_length": 743.2008056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.12053333333333334, "grad_norm": 0.01958407275378704, "kl": 0.196502685546875, "learning_rate": 2.4166666666666667e-06, "loss": -0.0467, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020146111026406288, "mask/share_reasoning": 0.8327199220657349, "mask/share_step_conf": 0.1197902113199234, "num_tokens": 34464567.0, "reward": 0.5459908246994019, "reward_std": 0.43909066915512085, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6068406105041504, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.18435978889465332, "step": 113 }, { "adv/mean_abs_final_conf": 0.6415958404541016, "adv/mean_abs_reasoning": 0.44854453206062317, "adv/mean_abs_step_conf": 0.7478067874908447, "adv/ratio_final_to_reasoning": 1.4303949654822377, "adv/ratio_step_to_reasoning": 1.6671851600896892, "adv/std_final_conf": 0.8132436275482178, "adv/std_reasoning": 0.739156186580658, "adv/std_step_conf": 0.9348743557929993, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6588422656141808, "calib/avg_num_step_conf": 8.20703125, "calib/ece": 0.2722134387351779, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8537549407114624, "calib/gap": 0.12430272815399512, "calib/mean_conf": 0.9283399209486165, "calib/mu_c": 0.9710843373493975, "calib/mu_w": 0.8467816091954024, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2722134387351779, "calib/std_conf": 0.16779555090747067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9076392961876832, "calib/step_q_c_n": 1364.0, "calib/step_q_gap": 0.016268875563531182, "calib/step_q_w": 0.8913704206241521, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 699.87109375, "completions/mean_terminated_length": 705.3818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.1216, "grad_norm": 0.018764222040772438, "kl": 0.18096923828125, "learning_rate": 2.388888888888889e-06, "loss": 0.011, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021557755768299103, "mask/share_reasoning": 0.8419281244277954, "mask/share_step_conf": 0.1287015974521637, "num_tokens": 34748758.0, "reward": 0.7011272311210632, "reward_std": 0.34007567167282104, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7196776866912842, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.35523295402526855, "step": 114 }, { "adv/mean_abs_final_conf": 0.75199294090271, "adv/mean_abs_reasoning": 0.4932301640510559, "adv/mean_abs_step_conf": 0.7876946926116943, "adv/ratio_final_to_reasoning": 1.5246288562855792, "adv/ratio_step_to_reasoning": 1.5970124092616473, "adv/std_final_conf": 0.8954560160636902, "adv/std_reasoning": 0.7394359111785889, "adv/std_step_conf": 0.9358144402503967, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.58203026481715, "calib/avg_num_step_conf": 7.95703125, "calib/ece": 0.43460317460317477, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7936507936507936, "calib/gap": 0.06140983606557393, "calib/mean_conf": 0.9187301587301587, "calib/mu_c": 0.9504098360655738, "calib/mu_w": 0.8889999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.43460317460317477, "calib/std_conf": 0.14954154345008988, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9002152852529602, "calib/step_q_c_n": 929.0, "calib/step_q_gap": -0.007970635324657027, "calib/step_q_w": 0.9081859205776173, "calib/step_q_w_n": 1108.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 754.8359375, "completions/mean_terminated_length": 757.796142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.12266666666666666, "grad_norm": 0.03260897099971771, "kl": 0.18072509765625, "learning_rate": 2.361111111111111e-06, "loss": 0.0386, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019767742604017258, "mask/share_reasoning": 0.8599216938018799, "mask/share_step_conf": 0.11640430986881256, "num_tokens": 35047260.0, "reward": 0.46756333112716675, "reward_std": 0.4043220579624176, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5565828084945679, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.08791884034872055, "step": 115 }, { "adv/mean_abs_final_conf": 0.7125102281570435, "adv/mean_abs_reasoning": 0.5437763929367065, "adv/mean_abs_step_conf": 0.7766680717468262, "adv/ratio_final_to_reasoning": 1.310300037684749, "adv/ratio_step_to_reasoning": 1.4282857472947108, "adv/std_final_conf": 0.8894214034080505, "adv/std_reasoning": 0.7754684686660767, "adv/std_step_conf": 0.9361055493354797, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6414127887512554, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.30568548387096767, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7096774193548387, "calib/gap": 0.10666287244727146, "calib/mean_conf": 0.867217741935484, "calib/mu_c": 0.9115172413793105, "calib/mu_w": 0.804854368932039, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2941129032258064, "calib/std_conf": 0.22687729597110243, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9069122807017544, "calib/step_q_c_n": 1140.0, "calib/step_q_gap": 0.01308812815938154, "calib/step_q_w": 0.8938241525423729, "calib/step_q_w_n": 944.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 767.15234375, "completions/mean_terminated_length": 782.4342651367188, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.12373333333333333, "grad_norm": 0.021685892716050148, "kl": 0.1681671142578125, "learning_rate": 2.3333333333333336e-06, "loss": -0.0286, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019999029114842415, "mask/share_reasoning": 0.8439480066299438, "mask/share_step_conf": 0.11652170121669769, "num_tokens": 35348171.0, "reward": 0.5921616554260254, "reward_std": 0.39703428745269775, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6525866985321045, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2254866510629654, "step": 116 }, { "adv/mean_abs_final_conf": 0.7528530359268188, "adv/mean_abs_reasoning": 0.5408031940460205, "adv/mean_abs_step_conf": 0.7640414237976074, "adv/ratio_final_to_reasoning": 1.3921016817492273, "adv/ratio_step_to_reasoning": 1.4127901465992267, "adv/std_final_conf": 0.9162237048149109, "adv/std_reasoning": 0.7929183840751648, "adv/std_step_conf": 0.9363798499107361, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6521139705882353, "calib/avg_num_step_conf": 7.87890625, "calib/ece": 0.3748987854251013, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.728744939271255, "calib/gap": 0.1118034401260507, "calib/mean_conf": 0.8834008097165993, "calib/mu_c": 0.9372656250000002, "calib/mu_w": 0.8254621848739495, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.37004048582995963, "calib/std_conf": 0.19722922112952876, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.9014519230769231, "calib/step_q_c_n": 1040.0, "calib/step_q_gap": 0.0030179415006691412, "calib/step_q_w": 0.898433981576254, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 723.44921875, "completions/mean_terminated_length": 737.860595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.1248, "grad_norm": 0.026363488286733627, "kl": 0.1810150146484375, "learning_rate": 2.305555555555556e-06, "loss": -0.0393, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020078789442777634, "mask/share_reasoning": 0.8408745527267456, "mask/share_step_conf": 0.11951541900634766, "num_tokens": 35639974.0, "reward": 0.5382317304611206, "reward_std": 0.4197598695755005, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5978370904922485, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.1895637810230255, "step": 117 }, { "adv/mean_abs_final_conf": 0.676365315914154, "adv/mean_abs_reasoning": 0.3761560320854187, "adv/mean_abs_step_conf": 0.7523123025894165, "adv/ratio_final_to_reasoning": 1.7980977525851902, "adv/ratio_step_to_reasoning": 2.00000063382894, "adv/std_final_conf": 0.892826497554779, "adv/std_reasoning": 0.6815088391304016, "adv/std_step_conf": 0.9354512095451355, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.705430463576159, "calib/avg_num_step_conf": 8.625, "calib/ece": 0.24362549800796823, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.19732582781456953, "calib/mean_conf": 0.8225099601593625, "calib/mu_c": 0.9011258278145695, "calib/mu_w": 0.7038, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23227091633466146, "calib/std_conf": 0.25377719252646, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8957572383073497, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.019020885229533158, "calib/step_q_w": 0.8767363530778165, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 764.70703125, "completions/mean_terminated_length": 770.7283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.12586666666666665, "grad_norm": 0.02284431643784046, "kl": 0.16656494140625, "learning_rate": 2.277777777777778e-06, "loss": 0.0102, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02022838033735752, "mask/share_reasoning": 0.8493303656578064, "mask/share_step_conf": 0.12262875586748123, "num_tokens": 35939747.0, "reward": 0.6449134349822998, "reward_std": 0.24360501766204834, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7236628532409668, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2521013617515564, "step": 118 }, { "adv/mean_abs_final_conf": 0.7165407538414001, "adv/mean_abs_reasoning": 0.5705307126045227, "adv/mean_abs_step_conf": 0.7834902405738831, "adv/ratio_final_to_reasoning": 1.25591968672524, "adv/ratio_step_to_reasoning": 1.3732656687265465, "adv/std_final_conf": 0.8990749716758728, "adv/std_reasoning": 0.8099278211593628, "adv/std_step_conf": 0.9363142251968384, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7174507124731603, "calib/avg_num_step_conf": 7.9609375, "calib/ece": 0.19856000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.472, "calib/gap": 0.21342442579217902, "calib/mean_conf": 0.74872, "calib/mu_c": 0.84177304964539, "calib/mu_w": 0.628348623853211, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1916400000000001, "calib/std_conf": 0.28160532949502215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8987666034155597, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.014356034309868648, "calib/step_q_w": 0.884410569105691, "calib/step_q_w_n": 984.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 826.421875, "completions/mean_terminated_length": 832.9291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.12693333333333334, "grad_norm": 0.03922202065587044, "kl": 0.1675872802734375, "learning_rate": 2.25e-06, "loss": -0.0041, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01915661431849003, "mask/share_reasoning": 0.8651344776153564, "mask/share_step_conf": 0.10789638757705688, "num_tokens": 36256375.0, "reward": 0.6312820315361023, "reward_std": 0.356397807598114, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7280832529067993, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.22979329526424408, "step": 119 }, { "adv/mean_abs_final_conf": 0.7109464406967163, "adv/mean_abs_reasoning": 0.5058482885360718, "adv/mean_abs_step_conf": 0.7477468252182007, "adv/ratio_final_to_reasoning": 1.4054538817442674, "adv/ratio_step_to_reasoning": 1.4782037266196646, "adv/std_final_conf": 0.9009770154953003, "adv/std_reasoning": 0.7753273844718933, "adv/std_step_conf": 0.9359827041625977, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7323376977632295, "calib/avg_num_step_conf": 7.74609375, "calib/ece": 0.22012000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.528, "calib/gap": 0.20368657937806878, "calib/mean_conf": 0.7747600000000001, "calib/mu_c": 0.8513461538461539, "calib/mu_w": 0.6476595744680851, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18544000000000013, "calib/std_conf": 0.28982087985512706, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9006685714285714, "calib/step_q_c_n": 1225.0, "calib/step_q_gap": 0.0040063022992836705, "calib/step_q_w": 0.8966622691292877, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 735.25, "completions/mean_terminated_length": 743.9683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.128, "grad_norm": 0.028242425993084908, "kl": 0.1798095703125, "learning_rate": 2.222222222222222e-06, "loss": -0.0093, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020412635058164597, "mask/share_reasoning": 0.8511585593223572, "mask/share_step_conf": 0.11671006679534912, "num_tokens": 36551287.0, "reward": 0.6879016160964966, "reward_std": 0.3711988925933838, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7291804552078247, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.3309977650642395, "step": 120 }, { "adv/mean_abs_final_conf": 0.7510311603546143, "adv/mean_abs_reasoning": 0.5703496932983398, "adv/mean_abs_step_conf": 0.7499484419822693, "adv/ratio_final_to_reasoning": 1.3167906797869762, "adv/ratio_step_to_reasoning": 1.3148923384972955, "adv/std_final_conf": 0.9112793803215027, "adv/std_reasoning": 0.8101462125778198, "adv/std_step_conf": 0.936364471912384, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6045390070921985, "calib/avg_num_step_conf": 7.49609375, "calib/ece": 0.2352868852459017, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4918032786885246, "calib/gap": 0.12740567375886536, "calib/mean_conf": 0.7654508196721314, "calib/mu_c": 0.8145333333333333, "calib/mu_w": 0.687127659574468, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1929918032786886, "calib/std_conf": 0.27767957127344967, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8997903930131003, "calib/step_q_c_n": 1145.0, "calib/step_q_gap": 0.002245173374857301, "calib/step_q_w": 0.897545219638243, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 786.95703125, "completions/mean_terminated_length": 809.0802612304688, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.12906666666666666, "grad_norm": 0.030293744057416916, "kl": 0.165740966796875, "learning_rate": 2.1944444444444445e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01884043961763382, "mask/share_reasoning": 0.8479143381118774, "mask/share_step_conf": 0.10590147972106934, "num_tokens": 36857804.0, "reward": 0.6501889228820801, "reward_std": 0.3920496702194214, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6897769570350647, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.3027883470058441, "step": 121 }, { "adv/mean_abs_final_conf": 0.7212374210357666, "adv/mean_abs_reasoning": 0.4761502146720886, "adv/mean_abs_step_conf": 0.7682005167007446, "adv/ratio_final_to_reasoning": 1.5147266530846, "adv/ratio_step_to_reasoning": 1.6133574931385528, "adv/std_final_conf": 0.9036675095558167, "adv/std_reasoning": 0.7393293380737305, "adv/std_step_conf": 0.9357211589813232, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6959190672153635, "calib/avg_num_step_conf": 7.89453125, "calib/ece": 0.1980555555555555, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.626984126984127, "calib/gap": 0.19480246913580235, "calib/mean_conf": 0.8317857142857144, "calib/mu_c": 0.901358024691358, "calib/mu_w": 0.7065555555555556, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19349206349206344, "calib/std_conf": 0.24911029326107098, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9024658634538152, "calib/step_q_c_n": 1245.0, "calib/step_q_gap": 0.005571533556907982, "calib/step_q_w": 0.8968943298969072, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 721.3984375, "completions/mean_terminated_length": 729.9525756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.13013333333333332, "grad_norm": 0.024334616959095, "kl": 0.19036865234375, "learning_rate": 2.166666666666667e-06, "loss": -0.0019, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02030223235487938, "mask/share_reasoning": 0.8518544435501099, "mask/share_step_conf": 0.11612460017204285, "num_tokens": 37149826.0, "reward": 0.7178915739059448, "reward_std": 0.34284746646881104, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7501226663589478, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.36300432682037354, "step": 122 }, { "adv/mean_abs_final_conf": 0.7650114297866821, "adv/mean_abs_reasoning": 0.5763481259346008, "adv/mean_abs_step_conf": 0.7651894092559814, "adv/ratio_final_to_reasoning": 1.3273426170097224, "adv/ratio_step_to_reasoning": 1.3276514225064189, "adv/std_final_conf": 0.8993167281150818, "adv/std_reasoning": 0.8100424408912659, "adv/std_step_conf": 0.9361500144004822, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5572519083969466, "calib/avg_num_step_conf": 8.13671875, "calib/ece": 0.2889754098360656, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.46311475409836067, "calib/gap": 0.06475511720597171, "calib/mean_conf": 0.7262704918032789, "calib/mu_c": 0.7562595419847328, "calib/mu_w": 0.6915044247787611, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23918032786885252, "calib/std_conf": 0.30409571769873533, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8969036027263877, "calib/step_q_c_n": 1027.0, "calib/step_q_gap": 0.007007769393054364, "calib/step_q_w": 0.8898958333333333, "calib/step_q_w_n": 1056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 789.74609375, "completions/mean_terminated_length": 815.2217407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.1312, "grad_norm": 0.034101828932762146, "kl": 0.1665191650390625, "learning_rate": 2.138888888888889e-06, "loss": -0.1115, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018441390246152878, "mask/share_reasoning": 0.8417726159095764, "mask/share_step_conf": 0.1085360199213028, "num_tokens": 37457289.0, "reward": 0.5628241896629333, "reward_std": 0.39414483308792114, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6245074272155762, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.20817218720912933, "step": 123 }, { "adv/mean_abs_final_conf": 0.7271085977554321, "adv/mean_abs_reasoning": 0.46908581256866455, "adv/mean_abs_step_conf": 0.7774999737739563, "adv/ratio_final_to_reasoning": 1.5500545492387885, "adv/ratio_step_to_reasoning": 1.6574791923815564, "adv/std_final_conf": 0.9254276156425476, "adv/std_reasoning": 0.7206630706787109, "adv/std_step_conf": 0.9363216161727905, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.680831265508685, "calib/avg_num_step_conf": 8.03515625, "calib/ece": 0.19795180722891564, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5622489959839357, "calib/gap": 0.18590777502067835, "calib/mean_conf": 0.7848594377510041, "calib/mu_c": 0.8542948717948718, "calib/mu_w": 0.6683870967741935, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17815261044176706, "calib/std_conf": 0.2836779388908106, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9042563681183238, "calib/step_q_c_n": 1217.0, "calib/step_q_gap": 0.030018272880228403, "calib/step_q_w": 0.8742380952380954, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 761.8359375, "completions/mean_terminated_length": 777.011962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.13226666666666667, "grad_norm": 0.028144368901848793, "kl": 0.176025390625, "learning_rate": 2.1111111111111114e-06, "loss": 0.0094, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019312703981995583, "mask/share_reasoning": 0.8514062762260437, "mask/share_step_conf": 0.10974974930286407, "num_tokens": 37759135.0, "reward": 0.7117006778717041, "reward_std": 0.3516343832015991, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.726138710975647, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.38163769245147705, "step": 124 }, { "adv/mean_abs_final_conf": 0.7411487102508545, "adv/mean_abs_reasoning": 0.4857451319694519, "adv/mean_abs_step_conf": 0.7906614542007446, "adv/ratio_final_to_reasoning": 1.5257975046417238, "adv/ratio_step_to_reasoning": 1.6277290335263075, "adv/std_final_conf": 0.9077005386352539, "adv/std_reasoning": 0.7575331330299377, "adv/std_step_conf": 0.9356077313423157, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6455156371155337, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.26476190476190475, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5793650793650794, "calib/gap": 0.1416024812613078, "calib/mean_conf": 0.7861904761904762, "calib/mu_c": 0.8457534246575342, "calib/mu_w": 0.7041509433962264, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2357936507936508, "calib/std_conf": 0.2885386198542446, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9060358744394619, "calib/step_q_c_n": 1115.0, "calib/step_q_gap": 0.007401143900539586, "calib/step_q_w": 0.8986347305389223, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 802.55078125, "completions/mean_terminated_length": 808.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.13333333333333333, "grad_norm": 0.026144176721572876, "kl": 0.166656494140625, "learning_rate": 2.0833333333333334e-06, "loss": 0.0016, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019170530140399933, "mask/share_reasoning": 0.8632123470306396, "mask/share_step_conf": 0.10980463027954102, "num_tokens": 38069396.0, "reward": 0.6401241421699524, "reward_std": 0.35746702551841736, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6850773692131042, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.28501462936401367, "step": 125 }, { "adv/mean_abs_final_conf": 0.6534983515739441, "adv/mean_abs_reasoning": 0.4111436605453491, "adv/mean_abs_step_conf": 0.7711385488510132, "adv/ratio_final_to_reasoning": 1.5894647401522157, "adv/ratio_step_to_reasoning": 1.8755939172895422, "adv/std_final_conf": 0.8342850804328918, "adv/std_reasoning": 0.7013512253761292, "adv/std_step_conf": 0.9359316229820251, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7610365302697819, "calib/avg_num_step_conf": 8.12890625, "calib/ece": 0.27228915662650605, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5983935742971888, "calib/gap": 0.29177423518781487, "calib/mean_conf": 0.7622489959839358, "calib/mu_c": 0.9110655737704918, "calib/mu_w": 0.6192913385826769, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27228915662650605, "calib/std_conf": 0.3206377966421584, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9034669338677356, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.023679306905593167, "calib/step_q_w": 0.8797876269621424, "calib/step_q_w_n": 1083.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 784.76953125, "completions/mean_terminated_length": 797.2262573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.1344, "grad_norm": 0.027145879343152046, "kl": 0.16473388671875, "learning_rate": 2.0555555555555555e-06, "loss": -0.0515, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019449051469564438, "mask/share_reasoning": 0.8491480350494385, "mask/share_step_conf": 0.11577790975570679, "num_tokens": 38375761.0, "reward": 0.5556277632713318, "reward_std": 0.29540807008743286, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6993195414543152, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.12209223210811615, "step": 126 }, { "adv/mean_abs_final_conf": 0.7096794247627258, "adv/mean_abs_reasoning": 0.45027339458465576, "adv/mean_abs_step_conf": 0.7957907915115356, "adv/ratio_final_to_reasoning": 1.5761078342577917, "adv/ratio_step_to_reasoning": 1.7673502389489266, "adv/std_final_conf": 0.8831758499145508, "adv/std_reasoning": 0.7207463979721069, "adv/std_step_conf": 0.9359408020973206, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7281753965080635, "calib/avg_num_step_conf": 7.984375, "calib/ece": 0.285469387755102, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6326530612244898, "calib/gap": 0.255379181660669, "calib/mean_conf": 0.7834285714285716, "calib/mu_c": 0.9116393442622951, "calib/mu_w": 0.6562601626016261, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.285469387755102, "calib/std_conf": 0.31052503267589776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9082281059063136, "calib/step_q_c_n": 982.0, "calib/step_q_gap": 0.024923021160550896, "calib/step_q_w": 0.8833050847457627, "calib/step_q_w_n": 1062.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 784.03125, "completions/mean_terminated_length": 793.328125, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.13546666666666668, "grad_norm": 0.02291964553296566, "kl": 0.1801605224609375, "learning_rate": 2.027777777777778e-06, "loss": 0.0462, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019718363881111145, "mask/share_reasoning": 0.8533198237419128, "mask/share_step_conf": 0.1152430847287178, "num_tokens": 38680145.0, "reward": 0.5593506097793579, "reward_std": 0.35522782802581787, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6697046756744385, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.16227777302265167, "step": 127 }, { "adv/mean_abs_final_conf": 0.7020530700683594, "adv/mean_abs_reasoning": 0.5561271905899048, "adv/mean_abs_step_conf": 0.7891441583633423, "adv/ratio_final_to_reasoning": 1.2623965919085265, "adv/ratio_step_to_reasoning": 1.4189994154507493, "adv/std_final_conf": 0.8692341446876526, "adv/std_reasoning": 0.775589644908905, "adv/std_step_conf": 0.9360777735710144, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6191236413043478, "calib/avg_num_step_conf": 7.87109375, "calib/ece": 0.3292181069958847, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6255144032921811, "calib/gap": 0.12339470108695638, "calib/mean_conf": 0.794650205761317, "calib/mu_c": 0.853046875, "calib/mu_w": 0.7296521739130436, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.298559670781893, "calib/std_conf": 0.29337854076085734, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9101501501501501, "calib/step_q_c_n": 999.0, "calib/step_q_gap": 0.02262751235487459, "calib/step_q_w": 0.8875226377952755, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 762.4453125, "completions/mean_terminated_length": 783.8795166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.13653333333333334, "grad_norm": 0.023058714345097542, "kl": 0.17486572265625, "learning_rate": 2.0000000000000003e-06, "loss": -0.0412, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.019529111683368683, "mask/share_reasoning": 0.8425556421279907, "mask/share_step_conf": 0.1105714738368988, "num_tokens": 38981995.0, "reward": 0.5060725808143616, "reward_std": 0.402359277009964, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.617258608341217, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.10582408308982849, "step": 128 }, { "adv/mean_abs_final_conf": 0.5893697142601013, "adv/mean_abs_reasoning": 0.4902503490447998, "adv/mean_abs_step_conf": 0.7598936557769775, "adv/ratio_final_to_reasoning": 1.2021811211525397, "adv/ratio_step_to_reasoning": 1.55001145283741, "adv/std_final_conf": 0.8214588761329651, "adv/std_reasoning": 0.7574241161346436, "adv/std_step_conf": 0.935619592666626, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6773800105485231, "calib/avg_num_step_conf": 8.30078125, "calib/ece": 0.2732677165354331, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7480314960629921, "calib/gap": 0.1619712552742616, "calib/mean_conf": 0.8681496062992126, "calib/mu_c": 0.929367088607595, "calib/mu_w": 0.7673958333333334, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25968503937007875, "calib/std_conf": 0.24575955148379136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9087342814371258, "calib/step_q_c_n": 1336.0, "calib/step_q_gap": 0.0032463219947935773, "calib/step_q_w": 0.9054879594423322, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 724.203125, "completions/mean_terminated_length": 727.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.1376, "grad_norm": 0.020458418875932693, "kl": 0.17547607421875, "learning_rate": 1.9722222222222224e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020674947649240494, "mask/share_reasoning": 0.8490486741065979, "mask/share_step_conf": 0.12637010216712952, "num_tokens": 39269775.0, "reward": 0.6816619634628296, "reward_std": 0.3260408043861389, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7115363478660583, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3306938707828522, "step": 129 }, { "adv/mean_abs_final_conf": 0.6116889715194702, "adv/mean_abs_reasoning": 0.32955822348594666, "adv/mean_abs_step_conf": 0.7390551567077637, "adv/ratio_final_to_reasoning": 1.8560877196425185, "adv/ratio_step_to_reasoning": 2.24256323781063, "adv/std_final_conf": 0.8465979099273682, "adv/std_reasoning": 0.6402920484542847, "adv/std_step_conf": 0.9348527193069458, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6206658854346176, "calib/avg_num_step_conf": 7.921875, "calib/ece": 0.2692063492063493, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7936507936507936, "calib/gap": 0.1499165919900739, "calib/mean_conf": 0.8553968253968254, "calib/mu_c": 0.9083435582822086, "calib/mu_w": 0.7584269662921347, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23888888888888898, "calib/std_conf": 0.28490502676442225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9076360808709176, "calib/step_q_c_n": 1286.0, "calib/step_q_gap": -0.0015957250590016292, "calib/step_q_w": 0.9092318059299193, "calib/step_q_w_n": 742.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 690.96484375, "completions/mean_terminated_length": 693.674560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.13866666666666666, "grad_norm": 0.03185328468680382, "kl": 0.1677703857421875, "learning_rate": 1.944444444444445e-06, "loss": -0.0024, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02168242260813713, "mask/share_reasoning": 0.8475232124328613, "mask/share_step_conf": 0.12688809633255005, "num_tokens": 39551950.0, "reward": 0.7144206166267395, "reward_std": 0.25448256731033325, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7040468454360962, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.40135687589645386, "step": 130 }, { "adv/mean_abs_final_conf": 0.6819969415664673, "adv/mean_abs_reasoning": 0.4341234564781189, "adv/mean_abs_step_conf": 0.7591865658760071, "adv/ratio_final_to_reasoning": 1.5709746418662867, "adv/ratio_step_to_reasoning": 1.7487803401249118, "adv/std_final_conf": 0.8898735046386719, "adv/std_reasoning": 0.720470130443573, "adv/std_step_conf": 0.9359843730926514, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6664562848790858, "calib/avg_num_step_conf": 8.0703125, "calib/ece": 0.3766935483870968, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6451612903225806, "calib/gap": 0.20516077597661453, "calib/mean_conf": 0.7817741935483871, "calib/mu_c": 0.8992452830188681, "calib/mu_w": 0.6940845070422536, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36552419354838717, "calib/std_conf": 0.31874008115395064, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9083021077283373, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.005991876705235044, "calib/step_q_w": 0.9023102310231023, "calib/step_q_w_n": 1212.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 746.39453125, "completions/mean_terminated_length": 761.2630004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.13973333333333332, "grad_norm": 0.038372308015823364, "kl": 0.167083740234375, "learning_rate": 1.916666666666667e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019734080880880356, "mask/share_reasoning": 0.8474262952804565, "mask/share_step_conf": 0.113308385014534, "num_tokens": 39849235.0, "reward": 0.44620394706726074, "reward_std": 0.3412773311138153, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.6088827848434448, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.006962493062019348, "step": 131 }, { "adv/mean_abs_final_conf": 0.6559932231903076, "adv/mean_abs_reasoning": 0.5363863706588745, "adv/mean_abs_step_conf": 0.787875235080719, "adv/ratio_final_to_reasoning": 1.2229863752587768, "adv/ratio_step_to_reasoning": 1.46885767084821, "adv/std_final_conf": 0.8319321274757385, "adv/std_reasoning": 0.775436282157898, "adv/std_step_conf": 0.9356468319892883, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6849190938511327, "calib/avg_num_step_conf": 7.95703125, "calib/ece": 0.26268774703557296, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6996047430830039, "calib/gap": 0.21572362459546934, "calib/mean_conf": 0.810909090909091, "calib/mu_c": 0.8987333333333334, "calib/mu_w": 0.683009708737864, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24035573122529633, "calib/std_conf": 0.30704051379424063, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9063180272108845, "calib/step_q_c_n": 1176.0, "calib/step_q_gap": 0.016527086444333983, "calib/step_q_w": 0.8897909407665505, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 740.9375, "completions/mean_terminated_length": 746.7716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.1408, "grad_norm": 0.019422784447669983, "kl": 0.172088623046875, "learning_rate": 1.888888888888889e-06, "loss": 0.013, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020528219640254974, "mask/share_reasoning": 0.8521432876586914, "mask/share_step_conf": 0.11951600760221481, "num_tokens": 40144507.0, "reward": 0.6724216341972351, "reward_std": 0.36871108412742615, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7125101685523987, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.31748926639556885, "step": 132 }, { "adv/mean_abs_final_conf": 0.7555452585220337, "adv/mean_abs_reasoning": 0.5980292558670044, "adv/mean_abs_step_conf": 0.7885544896125793, "adv/ratio_final_to_reasoning": 1.263391800835341, "adv/ratio_step_to_reasoning": 1.3185884835506205, "adv/std_final_conf": 0.9083207845687866, "adv/std_reasoning": 0.8267114162445068, "adv/std_step_conf": 0.9366989135742188, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6685675182481752, "calib/avg_num_step_conf": 8.1484375, "calib/ece": 0.3265261044176707, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6345381526104418, "calib/gap": 0.22614735401459862, "calib/mean_conf": 0.7602610441767068, "calib/mu_c": 0.8846875000000001, "calib/mu_w": 0.6585401459854014, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3184939759036145, "calib/std_conf": 0.3388102739520392, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9094806629834254, "calib/step_q_c_n": 905.0, "calib/step_q_gap": 0.011123338681986006, "calib/step_q_w": 0.8983573243014394, "calib/step_q_w_n": 1181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 835.5390625, "completions/mean_terminated_length": 845.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 318.0, "epoch": 0.14186666666666667, "grad_norm": 0.033650532364845276, "kl": 0.148590087890625, "learning_rate": 1.8611111111111113e-06, "loss": 0.031, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018224865198135376, "mask/share_reasoning": 0.8637900948524475, "mask/share_step_conf": 0.10626627504825592, "num_tokens": 40464749.0, "reward": 0.49485090374946594, "reward_std": 0.4596877694129944, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6354119777679443, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.07225852459669113, "step": 133 }, { "adv/mean_abs_final_conf": 0.7146537899971008, "adv/mean_abs_reasoning": 0.5521727800369263, "adv/mean_abs_step_conf": 0.7653998136520386, "adv/ratio_final_to_reasoning": 1.294257550959518, "adv/ratio_step_to_reasoning": 1.386159987098337, "adv/std_final_conf": 0.8777416944503784, "adv/std_reasoning": 0.7928855419158936, "adv/std_step_conf": 0.9364771246910095, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6981589147286822, "calib/avg_num_step_conf": 7.3671875, "calib/ece": 0.24196787148594356, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5662650602409639, "calib/gap": 0.266874031007752, "calib/mean_conf": 0.7275100401606427, "calib/mu_c": 0.8561240310077519, "calib/mu_w": 0.5892499999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2257028112449797, "calib/std_conf": 0.3459951959233053, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9080306230200634, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.013051922274589511, "calib/step_q_w": 0.8949787007454739, "calib/step_q_w_n": 939.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 848.98046875, "completions/mean_terminated_length": 855.6653442382812, "completions/min_length": 0.0, "completions/min_terminated_length": 386.0, "epoch": 0.14293333333333333, "grad_norm": 0.02200193516910076, "kl": 0.155487060546875, "learning_rate": 1.8333333333333333e-06, "loss": -0.0334, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017995350062847137, "mask/share_reasoning": 0.8758010268211365, "mask/share_step_conf": 0.09839113801717758, "num_tokens": 40791040.0, "reward": 0.5666602849960327, "reward_std": 0.4101633131504059, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6965745687484741, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.14221477508544922, "step": 134 }, { "adv/mean_abs_final_conf": 0.7415114045143127, "adv/mean_abs_reasoning": 0.5965020060539246, "adv/mean_abs_step_conf": 0.7876029014587402, "adv/ratio_final_to_reasoning": 1.2430995989765024, "adv/ratio_step_to_reasoning": 1.3203692417885011, "adv/std_final_conf": 0.8925070762634277, "adv/std_reasoning": 0.8268001079559326, "adv/std_step_conf": 0.936475396156311, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6228057656629085, "calib/avg_num_step_conf": 8.1484375, "calib/ece": 0.2920539419087136, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6556016597510373, "calib/gap": 0.1683380904809476, "calib/mean_conf": 0.7717219917012449, "calib/mu_c": 0.8401748251748252, "calib/mu_w": 0.6718367346938776, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23520746887966795, "calib/std_conf": 0.3404964058245485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9003955288048152, "calib/step_q_c_n": 1163.0, "calib/step_q_gap": 0.000774727071337411, "calib/step_q_w": 0.8996208017334778, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 756.12109375, "completions/mean_terminated_length": 786.857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 318.0, "epoch": 0.144, "grad_norm": 0.03226546198129654, "kl": 0.165618896484375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0553, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018887124955654144, "mask/share_reasoning": 0.8286978006362915, "mask/share_step_conf": 0.11335258930921555, "num_tokens": 41090487.0, "reward": 0.6198797225952148, "reward_std": 0.39496392011642456, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6516416668891907, "rewards/format_reward_step": 0.94140625, "rewards/step_correlation_reward": 0.28811773657798767, "step": 135 }, { "adv/mean_abs_final_conf": 0.6814755797386169, "adv/mean_abs_reasoning": 0.5314260721206665, "adv/mean_abs_step_conf": 0.7463881969451904, "adv/ratio_final_to_reasoning": 1.28235255191597, "adv/ratio_step_to_reasoning": 1.4045005243470898, "adv/std_final_conf": 0.8753324747085571, "adv/std_reasoning": 0.7927984595298767, "adv/std_step_conf": 0.9362938404083252, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6861426100326735, "calib/avg_num_step_conf": 8.0390625, "calib/ece": 0.2838400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.596, "calib/gap": 0.22379012108398988, "calib/mean_conf": 0.7393600000000001, "calib/mu_c": 0.8476744186046511, "calib/mu_w": 0.6238842975206612, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2536000000000001, "calib/std_conf": 0.352307238642637, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8973889392565729, "calib/step_q_c_n": 1103.0, "calib/step_q_gap": 0.023891557057620028, "calib/step_q_w": 0.8734973821989529, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 763.12109375, "completions/mean_terminated_length": 772.1699829101562, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.14506666666666668, "grad_norm": 0.021460261195898056, "kl": 0.167144775390625, "learning_rate": 1.777777777777778e-06, "loss": -0.0334, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019838891923427582, "mask/share_reasoning": 0.8512438535690308, "mask/share_step_conf": 0.11719851195812225, "num_tokens": 41394334.0, "reward": 0.5539348125457764, "reward_std": 0.3586582541465759, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.671519935131073, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.14103710651397705, "step": 136 }, { "adv/mean_abs_final_conf": 0.6621338725090027, "adv/mean_abs_reasoning": 0.46420344710350037, "adv/mean_abs_step_conf": 0.7904796600341797, "adv/ratio_final_to_reasoning": 1.4263872374075048, "adv/ratio_step_to_reasoning": 1.7028733090341135, "adv/std_final_conf": 0.8399810791015625, "adv/std_reasoning": 0.7393754720687866, "adv/std_step_conf": 0.9360789656639099, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7326352415966386, "calib/avg_num_step_conf": 8.20703125, "calib/ece": 0.2327016129032258, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.625, "calib/gap": 0.2830672268907565, "calib/mean_conf": 0.755766129032258, "calib/mu_c": 0.8836029411764706, "calib/mu_w": 0.6005357142857142, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22004032258064515, "calib/std_conf": 0.3423734428027801, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9086728110599079, "calib/step_q_c_n": 1085.0, "calib/step_q_gap": 0.03330863783156135, "calib/step_q_w": 0.8753641732283466, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 759.3359375, "completions/mean_terminated_length": 777.56005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.14613333333333334, "grad_norm": 0.023186106234788895, "kl": 0.162078857421875, "learning_rate": 1.75e-06, "loss": -0.0652, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01922602951526642, "mask/share_reasoning": 0.8434792757034302, "mask/share_step_conf": 0.1138571947813034, "num_tokens": 41695708.0, "reward": 0.6702703237533569, "reward_std": 0.33385735750198364, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7055327892303467, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3357889652252197, "step": 137 }, { "adv/mean_abs_final_conf": 0.6598755121231079, "adv/mean_abs_reasoning": 0.5995461344718933, "adv/mean_abs_step_conf": 0.777741551399231, "adv/ratio_final_to_reasoning": 1.1006250798436976, "adv/ratio_step_to_reasoning": 1.2972171892731825, "adv/std_final_conf": 0.86130690574646, "adv/std_reasoning": 0.8266723155975342, "adv/std_step_conf": 0.9359835386276245, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7679413498379016, "calib/avg_num_step_conf": 7.8515625, "calib/ece": 0.1534115226337449, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6090534979423868, "calib/gap": 0.3785459770114941, "calib/mean_conf": 0.7286378600823046, "calib/mu_c": 0.8641666666666666, "calib/mu_w": 0.4856206896551725, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12003703703703711, "calib/std_conf": 0.3585229766999016, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9093198380566804, "calib/step_q_c_n": 1235.0, "calib/step_q_gap": 0.034623063863131964, "calib/step_q_w": 0.8746967741935484, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 779.4453125, "completions/mean_terminated_length": 801.357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.1472, "grad_norm": 0.01940912939608097, "kl": 0.168487548828125, "learning_rate": 1.7222222222222224e-06, "loss": -0.0544, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.019332313910126686, "mask/share_reasoning": 0.8415598273277283, "mask/share_step_conf": 0.1117640882730484, "num_tokens": 41999582.0, "reward": 0.7093103528022766, "reward_std": 0.3921882212162018, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7631781101226807, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.34450504183769226, "step": 138 }, { "adv/mean_abs_final_conf": 0.589346170425415, "adv/mean_abs_reasoning": 0.378071665763855, "adv/mean_abs_step_conf": 0.7459247708320618, "adv/ratio_final_to_reasoning": 1.558821312977003, "adv/ratio_step_to_reasoning": 1.9729718949580561, "adv/std_final_conf": 0.8030606508255005, "adv/std_reasoning": 0.6613706946372986, "adv/std_step_conf": 0.9352872967720032, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.757540684624018, "calib/avg_num_step_conf": 8.3359375, "calib/ece": 0.20812000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.664, "calib/gap": 0.27966750841750854, "calib/mean_conf": 0.7945200000000001, "calib/mu_c": 0.8929629629629631, "calib/mu_w": 0.6132954545454545, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17732000000000006, "calib/std_conf": 0.3130724670104352, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9137986270022883, "calib/step_q_c_n": 1311.0, "calib/step_q_gap": 0.036860595410550756, "calib/step_q_w": 0.8769380315917376, "calib/step_q_w_n": 823.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 711.0703125, "completions/mean_terminated_length": 719.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.14826666666666666, "grad_norm": 0.0213282760232687, "kl": 0.1831817626953125, "learning_rate": 1.6944444444444446e-06, "loss": 0.0103, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02053791843354702, "mask/share_reasoning": 0.8478147983551025, "mask/share_step_conf": 0.11992855370044708, "num_tokens": 42284712.0, "reward": 0.7505369782447815, "reward_std": 0.2918218970298767, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7617222666740417, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.41747671365737915, "step": 139 }, { "adv/mean_abs_final_conf": 0.6343496441841125, "adv/mean_abs_reasoning": 0.48369088768959045, "adv/mean_abs_step_conf": 0.7830361127853394, "adv/ratio_final_to_reasoning": 1.3114773511946076, "adv/ratio_step_to_reasoning": 1.6188771232090158, "adv/std_final_conf": 0.845893919467926, "adv/std_reasoning": 0.7394121289253235, "adv/std_step_conf": 0.9360425472259521, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.710564125831821, "calib/avg_num_step_conf": 8.2734375, "calib/ece": 0.20199999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.668, "calib/gap": 0.27235329703569255, "calib/mean_conf": 0.7664000000000001, "calib/mu_c": 0.8491954022988505, "calib/mu_w": 0.5768421052631579, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13619999999999993, "calib/std_conf": 0.3427509883282614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.91, "calib/step_q_c_n": 1397.0, "calib/step_q_gap": 0.03300970873786402, "calib/step_q_w": 0.876990291262136, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 735.140625, "completions/mean_terminated_length": 743.8577270507812, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.14933333333333335, "grad_norm": 0.03776135668158531, "kl": 0.1652679443359375, "learning_rate": 1.6666666666666667e-06, "loss": 0.0303, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020328957587480545, "mask/share_reasoning": 0.8499683141708374, "mask/share_step_conf": 0.11798396706581116, "num_tokens": 42577924.0, "reward": 0.7632162570953369, "reward_std": 0.31106436252593994, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7629226446151733, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.43225979804992676, "step": 140 }, { "adv/mean_abs_final_conf": 0.572924017906189, "adv/mean_abs_reasoning": 0.39419761300086975, "adv/mean_abs_step_conf": 0.7634836435317993, "adv/ratio_final_to_reasoning": 1.4533929151542704, "adv/ratio_step_to_reasoning": 1.9368043294826212, "adv/std_final_conf": 0.7978944778442383, "adv/std_reasoning": 0.681702733039856, "adv/std_step_conf": 0.9354971647262573, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7728465083303793, "calib/avg_num_step_conf": 7.96875, "calib/ece": 0.16658536585365846, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6463414634146342, "calib/gap": 0.35866997518610433, "calib/mean_conf": 0.7498373983739838, "calib/mu_c": 0.8825161290322581, "calib/mu_w": 0.5238461538461537, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.143170731707317, "calib/std_conf": 0.3539245331800658, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9111320754716982, "calib/step_q_c_n": 1219.0, "calib/step_q_gap": 0.08411624112334248, "calib/step_q_w": 0.8270158343483557, "calib/step_q_w_n": 821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 755.140625, "completions/mean_terminated_length": 776.3694458007812, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.1504, "grad_norm": 0.02357516810297966, "kl": 0.168670654296875, "learning_rate": 1.638888888888889e-06, "loss": -0.0829, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019246093928813934, "mask/share_reasoning": 0.8440717458724976, "mask/share_step_conf": 0.1093384325504303, "num_tokens": 42878336.0, "reward": 0.7093539237976074, "reward_std": 0.2913316488265991, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.76347815990448, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.3419484794139862, "step": 141 }, { "adv/mean_abs_final_conf": 0.6818134784698486, "adv/mean_abs_reasoning": 0.5208780169487, "adv/mean_abs_step_conf": 0.7369670867919922, "adv/ratio_final_to_reasoning": 1.3089695788351898, "adv/ratio_step_to_reasoning": 1.4148554225980596, "adv/std_final_conf": 0.8804184794425964, "adv/std_reasoning": 0.792790949344635, "adv/std_step_conf": 0.9357167482376099, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7144522921108741, "calib/avg_num_step_conf": 8.21875, "calib/ece": 0.23434959349593493, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5975609756097561, "calib/gap": 0.29858208955223897, "calib/mean_conf": 0.7026422764227643, "calib/mu_c": 0.8385820895522389, "calib/mu_w": 0.5399999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19613821138211376, "calib/std_conf": 0.3787011527454767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9123612417685795, "calib/step_q_c_n": 1063.0, "calib/step_q_gap": 0.033677284035630195, "calib/step_q_w": 0.8786839577329493, "calib/step_q_w_n": 1041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 803.51953125, "completions/mean_terminated_length": 813.0474853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.15146666666666667, "grad_norm": 0.029727516695857048, "kl": 0.1576995849609375, "learning_rate": 1.6111111111111113e-06, "loss": -0.034, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01919659972190857, "mask/share_reasoning": 0.855238139629364, "mask/share_step_conf": 0.11384646594524384, "num_tokens": 43189197.0, "reward": 0.6065089702606201, "reward_std": 0.327970951795578, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6992902755737305, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.21763396263122559, "step": 142 }, { "adv/mean_abs_final_conf": 0.6424644589424133, "adv/mean_abs_reasoning": 0.4570094048976898, "adv/mean_abs_step_conf": 0.7492420077323914, "adv/ratio_final_to_reasoning": 1.4058013950199584, "adv/ratio_step_to_reasoning": 1.6394454899678121, "adv/std_final_conf": 0.8636579513549805, "adv/std_reasoning": 0.7393165230751038, "adv/std_step_conf": 0.9361722469329834, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7209853638425066, "calib/avg_num_step_conf": 7.8671875, "calib/ece": 0.2088821138211382, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5934959349593496, "calib/gap": 0.2949958771387342, "calib/mean_conf": 0.712540650406504, "calib/mu_c": 0.8312585034013605, "calib/mu_w": 0.5362626262626263, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16193089430894309, "calib/std_conf": 0.3624702796535149, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9037253218884121, "calib/step_q_c_n": 1165.0, "calib/step_q_gap": 0.01621059868464292, "calib/step_q_w": 0.8875147232037692, "calib/step_q_w_n": 849.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 779.90625, "completions/mean_terminated_length": 795.4422607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.15253333333333333, "grad_norm": 0.02792339399456978, "kl": 0.1596221923828125, "learning_rate": 1.5833333333333333e-06, "loss": 0.054, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0190900769084692, "mask/share_reasoning": 0.8507193922996521, "mask/share_step_conf": 0.11065928637981415, "num_tokens": 43496189.0, "reward": 0.6489737629890442, "reward_std": 0.2787196934223175, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7272330522537231, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.26368314027786255, "step": 143 }, { "adv/mean_abs_final_conf": 0.6125003099441528, "adv/mean_abs_reasoning": 0.43078356981277466, "adv/mean_abs_step_conf": 0.7561758160591125, "adv/ratio_final_to_reasoning": 1.4218283910186156, "adv/ratio_step_to_reasoning": 1.7553497139822638, "adv/std_final_conf": 0.8434417247772217, "adv/std_reasoning": 0.7205666303634644, "adv/std_step_conf": 0.9356434345245361, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7194492254733218, "calib/avg_num_step_conf": 8.11328125, "calib/ece": 0.19257999999999986, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.572, "calib/gap": 0.33783921399885264, "calib/mean_conf": 0.67498, "calib/mu_c": 0.7884939759036146, "calib/mu_w": 0.45065476190476195, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10177999999999988, "calib/std_conf": 0.38638025260098374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9077133907595929, "calib/step_q_c_n": 1277.0, "calib/step_q_gap": 0.0018883907595927951, "calib/step_q_w": 0.9058250000000001, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 750.02734375, "completions/mean_terminated_length": 758.9209594726562, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.1536, "grad_norm": 0.03318783640861511, "kl": 0.18798828125, "learning_rate": 1.5555555555555558e-06, "loss": 0.0073, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020067322999238968, "mask/share_reasoning": 0.85340416431427, "mask/share_step_conf": 0.11480970680713654, "num_tokens": 43792324.0, "reward": 0.7210659980773926, "reward_std": 0.2872735261917114, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7599924802780151, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3571394085884094, "step": 144 }, { "adv/mean_abs_final_conf": 0.6912169456481934, "adv/mean_abs_reasoning": 0.6103887557983398, "adv/mean_abs_step_conf": 0.7602620124816895, "adv/ratio_final_to_reasoning": 1.1324208368552542, "adv/ratio_step_to_reasoning": 1.2455373813158261, "adv/std_final_conf": 0.8755853772163391, "adv/std_reasoning": 0.826539158821106, "adv/std_step_conf": 0.9361149668693542, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6756991837280878, "calib/avg_num_step_conf": 8.44140625, "calib/ece": 0.22039525691699607, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5968379446640316, "calib/gap": 0.25610196708149313, "calib/mean_conf": 0.7237154150197629, "calib/mu_c": 0.8188679245283018, "calib/mu_w": 0.5627659574468087, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15782608695652178, "calib/std_conf": 0.36024424910245134, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.907763358778626, "calib/step_q_c_n": 1310.0, "calib/step_q_gap": 0.021899669001892663, "calib/step_q_w": 0.8858636897767334, "calib/step_q_w_n": 851.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 737.9765625, "completions/mean_terminated_length": 740.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.15466666666666667, "grad_norm": 0.03555729612708092, "kl": 0.178863525390625, "learning_rate": 1.527777777777778e-06, "loss": 0.0632, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020503666251897812, "mask/share_reasoning": 0.8497222661972046, "mask/share_step_conf": 0.1258678436279297, "num_tokens": 44083950.0, "reward": 0.6904496550559998, "reward_std": 0.34167811274528503, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7384937405586243, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3205304741859436, "step": 145 }, { "adv/mean_abs_final_conf": 0.6599738597869873, "adv/mean_abs_reasoning": 0.3847285509109497, "adv/mean_abs_step_conf": 0.7628105282783508, "adv/ratio_final_to_reasoning": 1.7154273012083956, "adv/ratio_step_to_reasoning": 1.9827239919475406, "adv/std_final_conf": 0.85041743516922, "adv/std_reasoning": 0.6613991856575012, "adv/std_step_conf": 0.9356829524040222, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.725688924218336, "calib/avg_num_step_conf": 8.41015625, "calib/ece": 0.24115384615384622, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5384615384615384, "calib/gap": 0.3158220720720719, "calib/mean_conf": 0.6631781376518217, "calib/mu_c": 0.8370720720720719, "calib/mu_w": 0.52125, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22746963562753042, "calib/std_conf": 0.3803313607572731, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075369978858352, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.015275191754932083, "calib/step_q_w": 0.8922618061309031, "calib/step_q_w_n": 1207.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 777.671875, "completions/mean_terminated_length": 793.1633911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.15573333333333333, "grad_norm": 0.046256691217422485, "kl": 0.169403076171875, "learning_rate": 1.5e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019423943012952805, "mask/share_reasoning": 0.8471709489822388, "mask/share_step_conf": 0.11387380957603455, "num_tokens": 44390250.0, "reward": 0.47675973176956177, "reward_std": 0.30644434690475464, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6932385563850403, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": -0.019406616687774658, "step": 146 }, { "adv/mean_abs_final_conf": 0.6734507083892822, "adv/mean_abs_reasoning": 0.316013365983963, "adv/mean_abs_step_conf": 0.7818354368209839, "adv/ratio_final_to_reasoning": 2.1310829885070697, "adv/ratio_step_to_reasoning": 2.474058128480111, "adv/std_final_conf": 0.861054003238678, "adv/std_reasoning": 0.6402580142021179, "adv/std_step_conf": 0.9357953071594238, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6833056754065715, "calib/avg_num_step_conf": 8.38671875, "calib/ece": 0.30921951219512195, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6300813008130082, "calib/gap": 0.2612223033521406, "calib/mean_conf": 0.7102764227642276, "calib/mu_c": 0.8493826086956521, "calib/mu_w": 0.5881603053435115, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2760081300813008, "calib/std_conf": 0.3834393095109842, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9026067761806983, "calib/step_q_c_n": 974.0, "calib/step_q_gap": -0.0051928828133340765, "calib/step_q_w": 0.9077996589940324, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 789.7265625, "completions/mean_terminated_length": 799.0909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.1568, "grad_norm": 0.03732088953256607, "kl": 0.164581298828125, "learning_rate": 1.4722222222222225e-06, "loss": 0.0063, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01913021132349968, "mask/share_reasoning": 0.8517061471939087, "mask/share_step_conf": 0.11744489520788193, "num_tokens": 44696100.0, "reward": 0.4662114679813385, "reward_std": 0.27114027738571167, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6487675905227661, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.00162409245967865, "step": 147 }, { "adv/mean_abs_final_conf": 0.6306105852127075, "adv/mean_abs_reasoning": 0.5097656846046448, "adv/mean_abs_step_conf": 0.7392554879188538, "adv/ratio_final_to_reasoning": 1.2370597006775486, "adv/ratio_step_to_reasoning": 1.4501868412193981, "adv/std_final_conf": 0.8591131567955017, "adv/std_reasoning": 0.7752926349639893, "adv/std_step_conf": 0.9363844394683838, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7374319361782955, "calib/avg_num_step_conf": 7.99609375, "calib/ece": 0.1976470588235294, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6313725490196078, "calib/gap": 0.3357699126250474, "calib/mean_conf": 0.7266666666666667, "calib/mu_c": 0.866241610738255, "calib/mu_w": 0.5304716981132076, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16999999999999998, "calib/std_conf": 0.3668021140201801, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9106073211314477, "calib/step_q_c_n": 1202.0, "calib/step_q_gap": 0.029175368468725704, "calib/step_q_w": 0.881431952662722, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 711.46484375, "completions/mean_terminated_length": 714.2549438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.15786666666666666, "grad_norm": 0.03166365996003151, "kl": 0.1800537109375, "learning_rate": 1.4444444444444445e-06, "loss": 0.0061, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.021450942382216454, "mask/share_reasoning": 0.8483710289001465, "mask/share_step_conf": 0.1262717992067337, "num_tokens": 44983347.0, "reward": 0.6800941228866577, "reward_std": 0.3421480059623718, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7624218463897705, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.28214138746261597, "step": 148 }, { "adv/mean_abs_final_conf": 0.6537437438964844, "adv/mean_abs_reasoning": 0.5020466446876526, "adv/mean_abs_step_conf": 0.7768505811691284, "adv/ratio_final_to_reasoning": 1.3021573808210787, "adv/ratio_step_to_reasoning": 1.547367340045554, "adv/std_final_conf": 0.8794245719909668, "adv/std_reasoning": 0.7576271891593933, "adv/std_step_conf": 0.9356343746185303, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7993790064102564, "calib/avg_num_step_conf": 7.71875, "calib/ece": 0.12939516129032252, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5685483870967742, "calib/gap": 0.45433226495726514, "calib/mean_conf": 0.662459677419355, "calib/mu_c": 0.8529861111111112, "calib/mu_w": 0.3986538461538461, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10560483870967735, "calib/std_conf": 0.3929397043788211, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.910442238267148, "calib/step_q_c_n": 1108.0, "calib/step_q_gap": 0.051386938727977394, "calib/step_q_w": 0.8590552995391706, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 772.22265625, "completions/mean_terminated_length": 787.6055908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.15893333333333334, "grad_norm": 0.02984035573899746, "kl": 0.157073974609375, "learning_rate": 1.4166666666666667e-06, "loss": -0.0238, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019209206104278564, "mask/share_reasoning": 0.8516172170639038, "mask/share_step_conf": 0.10964234173297882, "num_tokens": 45285492.0, "reward": 0.6858113408088684, "reward_std": 0.32667478919029236, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7875117063522339, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.27942347526550293, "step": 149 }, { "adv/mean_abs_final_conf": 0.65451580286026, "adv/mean_abs_reasoning": 0.52001953125, "adv/mean_abs_step_conf": 0.7654784917831421, "adv/ratio_final_to_reasoning": 1.2586369617444249, "adv/ratio_step_to_reasoning": 1.4720187334947183, "adv/std_final_conf": 0.8596464395523071, "adv/std_reasoning": 0.7754071354866028, "adv/std_step_conf": 0.9360762238502502, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7629945181956277, "calib/avg_num_step_conf": 8.28515625, "calib/ece": 0.18471999999999988, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.636, "calib/gap": 0.3906043193976619, "calib/mean_conf": 0.71152, "calib/mu_c": 0.8724489795918368, "calib/mu_w": 0.4818446601941749, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1541199999999999, "calib/std_conf": 0.37992432088509415, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9113223140495869, "calib/step_q_c_n": 1210.0, "calib/step_q_gap": 0.05469223721094785, "calib/step_q_w": 0.856630076838639, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 729.2734375, "completions/mean_terminated_length": 735.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.16, "grad_norm": 0.02691521868109703, "kl": 0.175537109375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0028, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.021217215806245804, "mask/share_reasoning": 0.8411681056022644, "mask/share_step_conf": 0.12980221211910248, "num_tokens": 45577146.0, "reward": 0.6711744666099548, "reward_std": 0.3526858687400818, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.765114426612854, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2686406373977661, "step": 150 }, { "adv/mean_abs_final_conf": 0.7199978232383728, "adv/mean_abs_reasoning": 0.550804853439331, "adv/mean_abs_step_conf": 0.756540060043335, "adv/ratio_final_to_reasoning": 1.307174072164703, "adv/ratio_step_to_reasoning": 1.3735174178647007, "adv/std_final_conf": 0.8927524089813232, "adv/std_reasoning": 0.7930209040641785, "adv/std_step_conf": 0.9363460540771484, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7288725258393569, "calib/avg_num_step_conf": 8.33984375, "calib/ece": 0.21118852459016396, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.42213114754098363, "calib/gap": 0.30035195568465867, "calib/mean_conf": 0.5674180327868852, "calib/mu_c": 0.7286725663716815, "calib/mu_w": 0.4283206106870229, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15774590163934427, "calib/std_conf": 0.39256550753677466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8955125523012553, "calib/step_q_c_n": 956.0, "calib/step_q_gap": 0.04628439284408803, "calib/step_q_w": 0.8492281594571672, "calib/step_q_w_n": 1179.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 780.08203125, "completions/mean_terminated_length": 798.8040161132812, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.16106666666666666, "grad_norm": 0.02839108370244503, "kl": 0.1691131591796875, "learning_rate": 1.3611111111111112e-06, "loss": -0.0788, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019112911075353622, "mask/share_reasoning": 0.8431797027587891, "mask/share_step_conf": 0.11426986753940582, "num_tokens": 45883871.0, "reward": 0.5250996947288513, "reward_std": 0.3253830671310425, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7012449502944946, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.07004818320274353, "step": 151 }, { "adv/mean_abs_final_conf": 0.7303016781806946, "adv/mean_abs_reasoning": 0.5549091696739197, "adv/mean_abs_step_conf": 0.7628487348556519, "adv/ratio_final_to_reasoning": 1.3160742660097697, "adv/ratio_step_to_reasoning": 1.3747272104080084, "adv/std_final_conf": 0.9063445925712585, "adv/std_reasoning": 0.8098983764648438, "adv/std_step_conf": 0.9365310668945312, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7526132404181185, "calib/avg_num_step_conf": 8.62890625, "calib/ece": 0.19837349397590362, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.46184738955823296, "calib/gap": 0.3666753774680604, "calib/mean_conf": 0.5768473895582329, "calib/mu_c": 0.7579761904761905, "calib/mu_w": 0.3913008130081301, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13459839357429718, "calib/std_conf": 0.4078563324934811, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9114836795252227, "calib/step_q_c_n": 1011.0, "calib/step_q_gap": 0.06239352927480535, "calib/step_q_w": 0.8490901502504173, "calib/step_q_w_n": 1198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 772.8046875, "completions/mean_terminated_length": 781.9683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.16213333333333332, "grad_norm": 0.035195719450712204, "kl": 0.16558837890625, "learning_rate": 1.3333333333333334e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020163387060165405, "mask/share_reasoning": 0.8471139669418335, "mask/share_step_conf": 0.12100391089916229, "num_tokens": 46187101.0, "reward": 0.6216672658920288, "reward_std": 0.3602669835090637, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7372526526451111, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.21389436721801758, "step": 152 }, { "adv/mean_abs_final_conf": 0.7047197222709656, "adv/mean_abs_reasoning": 0.3971008062362671, "adv/mean_abs_step_conf": 0.7533491849899292, "adv/ratio_final_to_reasoning": 1.7746620283910262, "adv/ratio_step_to_reasoning": 1.8971232824485917, "adv/std_final_conf": 0.8927444219589233, "adv/std_reasoning": 0.7014403343200684, "adv/std_step_conf": 0.9356233477592468, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6776244588744588, "calib/avg_num_step_conf": 7.734375, "calib/ece": 0.27427999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.496, "calib/gap": 0.24861471861471868, "calib/mean_conf": 0.60148, "calib/mu_c": 0.696948051948052, "calib/mu_w": 0.4483333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12987999999999997, "calib/std_conf": 0.4081673793923272, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9095986895986897, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": 0.035152049282484166, "calib/step_q_w": 0.8744466403162056, "calib/step_q_w_n": 759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 777.4921875, "completions/mean_terminated_length": 783.6141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 324.0, "epoch": 0.1632, "grad_norm": 0.04205191507935524, "kl": 0.1555328369140625, "learning_rate": 1.3055555555555556e-06, "loss": -0.0431, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019509706646203995, "mask/share_reasoning": 0.8606699705123901, "mask/share_step_conf": 0.11200778931379318, "num_tokens": 46493459.0, "reward": 0.6556740999221802, "reward_std": 0.2867233157157898, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6975207328796387, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2982025146484375, "step": 153 }, { "adv/mean_abs_final_conf": 0.6398210525512695, "adv/mean_abs_reasoning": 0.4637221693992615, "adv/mean_abs_step_conf": 0.7866086959838867, "adv/ratio_final_to_reasoning": 1.3797508395601163, "adv/ratio_step_to_reasoning": 1.6962930562558942, "adv/std_final_conf": 0.8462924957275391, "adv/std_reasoning": 0.739355206489563, "adv/std_step_conf": 0.9352414011955261, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7172987616099071, "calib/avg_num_step_conf": 8.08984375, "calib/ece": 0.21296, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.35551212590299275, "calib/mean_conf": 0.59296, "calib/mu_c": 0.7550735294117646, "calib/mu_w": 0.3995614035087719, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13096000000000002, "calib/std_conf": 0.41283318471266334, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9078578199052132, "calib/step_q_c_n": 1055.0, "calib/step_q_gap": 0.03562356793670918, "calib/step_q_w": 0.872234251968504, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 761.72265625, "completions/mean_terminated_length": 770.7549438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.16426666666666667, "grad_norm": 0.030612580478191376, "kl": 0.184112548828125, "learning_rate": 1.2777777777777779e-06, "loss": -0.0058, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019752800464630127, "mask/share_reasoning": 0.8524797558784485, "mask/share_step_conf": 0.11604867875576019, "num_tokens": 46792900.0, "reward": 0.6149193048477173, "reward_std": 0.31094467639923096, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7377804517745972, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.190495565533638, "step": 154 }, { "adv/mean_abs_final_conf": 0.7606242895126343, "adv/mean_abs_reasoning": 0.5110421776771545, "adv/mean_abs_step_conf": 0.7742916941642761, "adv/ratio_final_to_reasoning": 1.4883786950226066, "adv/ratio_step_to_reasoning": 1.5151228763224054, "adv/std_final_conf": 0.9109428524971008, "adv/std_reasoning": 0.775321364402771, "adv/std_step_conf": 0.9362097382545471, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6389382778765558, "calib/avg_num_step_conf": 8.36328125, "calib/ece": 0.2739840637450198, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.44223107569721115, "calib/gap": 0.22977647955295888, "calib/mean_conf": 0.5523904382470118, "calib/mu_c": 0.6659055118110234, "calib/mu_w": 0.43612903225806454, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16019920318725092, "calib/std_conf": 0.4111017445483834, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893562271062271, "calib/step_q_c_n": 1092.0, "calib/step_q_gap": 0.007642347325378718, "calib/step_q_w": 0.8859199237368923, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 739.1953125, "completions/mean_terminated_length": 742.0941772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.16533333333333333, "grad_norm": 0.04050329327583313, "kl": 0.18646240234375, "learning_rate": 1.25e-06, "loss": 0.0259, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020515194162726402, "mask/share_reasoning": 0.8497653007507324, "mask/share_step_conf": 0.125813290476799, "num_tokens": 47089350.0, "reward": 0.5554911494255066, "reward_std": 0.3071201741695404, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6801988482475281, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.1354709416627884, "step": 155 }, { "adv/mean_abs_final_conf": 0.6814758777618408, "adv/mean_abs_reasoning": 0.46522659063339233, "adv/mean_abs_step_conf": 0.7735946178436279, "adv/ratio_final_to_reasoning": 1.4648257246732854, "adv/ratio_step_to_reasoning": 1.66283405424097, "adv/std_final_conf": 0.8751667737960815, "adv/std_reasoning": 0.7393137812614441, "adv/std_step_conf": 0.9359323978424072, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5959168472372698, "calib/avg_num_step_conf": 8.296875, "calib/ece": 0.3109756097560975, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4715447154471545, "calib/gap": 0.15321641386782242, "calib/mean_conf": 0.5764227642276422, "calib/mu_c": 0.6411971830985916, "calib/mu_w": 0.4879807692307692, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15508130081300808, "calib/std_conf": 0.4108352192399877, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.898228176318064, "calib/step_q_c_n": 1157.0, "calib/step_q_gap": 0.0323853634948994, "calib/step_q_w": 0.8658428128231646, "calib/step_q_w_n": 967.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 760.60546875, "completions/mean_terminated_length": 766.594482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.1664, "grad_norm": 0.031293194741010666, "kl": 0.1655731201171875, "learning_rate": 1.2222222222222223e-06, "loss": -0.0188, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019975952804088593, "mask/share_reasoning": 0.848551332950592, "mask/share_step_conf": 0.12366022914648056, "num_tokens": 47388825.0, "reward": 0.6052007675170898, "reward_std": 0.2788010835647583, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.634109377861023, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.27394840121269226, "step": 156 }, { "adv/mean_abs_final_conf": 0.6583760976791382, "adv/mean_abs_reasoning": 0.4844781756401062, "adv/mean_abs_step_conf": 0.7745144367218018, "adv/ratio_final_to_reasoning": 1.3589386081411678, "adv/ratio_step_to_reasoning": 1.5986570204085901, "adv/std_final_conf": 0.8895961046218872, "adv/std_reasoning": 0.7755122184753418, "adv/std_step_conf": 0.9363909363746643, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7374674702095604, "calib/avg_num_step_conf": 8.95703125, "calib/ece": 0.20248987854251008, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5587044534412956, "calib/gap": 0.37048691960005486, "calib/mean_conf": 0.6547165991902834, "calib/mu_c": 0.8017114093959732, "calib/mu_w": 0.4312244897959183, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12698380566801615, "calib/std_conf": 0.3979885161097442, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8932716807367613, "calib/step_q_c_n": 1303.0, "calib/step_q_gap": 0.04945349891857942, "calib/step_q_w": 0.8438181818181819, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 729.78125, "completions/mean_terminated_length": 753.3225708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.16746666666666668, "grad_norm": 0.02866934984922409, "kl": 0.171966552734375, "learning_rate": 1.1944444444444446e-06, "loss": -0.1051, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019679686054587364, "mask/share_reasoning": 0.8270054459571838, "mask/share_step_conf": 0.12206491827964783, "num_tokens": 47679377.0, "reward": 0.6636831164360046, "reward_std": 0.33132559061050415, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7471444606781006, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.27162790298461914, "step": 157 }, { "adv/mean_abs_final_conf": 0.6822168231010437, "adv/mean_abs_reasoning": 0.5764622688293457, "adv/mean_abs_step_conf": 0.7968577742576599, "adv/ratio_final_to_reasoning": 1.1834544253632762, "adv/ratio_step_to_reasoning": 1.3823242514655534, "adv/std_final_conf": 0.8623555898666382, "adv/std_reasoning": 0.7929839491844177, "adv/std_step_conf": 0.936363160610199, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6801173139158576, "calib/avg_num_step_conf": 8.29296875, "calib/ece": 0.25137651821862356, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.708502024291498, "calib/gap": 0.284259708737864, "calib/mean_conf": 0.7627125506072874, "calib/mu_c": 0.8812500000000001, "calib/mu_w": 0.5969902912621361, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2155465587044535, "calib/std_conf": 0.36784789691879916, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9059918032786884, "calib/step_q_c_n": 1220.0, "calib/step_q_gap": 0.04028859176152344, "calib/step_q_w": 0.865703211517165, "calib/step_q_w_n": 903.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 737.34765625, "completions/mean_terminated_length": 749.0516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 0.16853333333333334, "grad_norm": 0.0269920751452446, "kl": 0.1685638427734375, "learning_rate": 1.1666666666666668e-06, "loss": -0.0251, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020759008824825287, "mask/share_reasoning": 0.8345186710357666, "mask/share_step_conf": 0.12909731268882751, "num_tokens": 47973378.0, "reward": 0.6014918684959412, "reward_std": 0.4104700982570648, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.701915979385376, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.19559898972511292, "step": 158 }, { "adv/mean_abs_final_conf": 0.6884654760360718, "adv/mean_abs_reasoning": 0.526401162147522, "adv/mean_abs_step_conf": 0.7543092370033264, "adv/ratio_final_to_reasoning": 1.3078722570204582, "adv/ratio_step_to_reasoning": 1.43295511340823, "adv/std_final_conf": 0.8764867186546326, "adv/std_reasoning": 0.8098902106285095, "adv/std_step_conf": 0.9364614486694336, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6864791288566244, "calib/avg_num_step_conf": 8.44921875, "calib/ece": 0.27042168674698797, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5863453815261044, "calib/gap": 0.28616800622245275, "calib/mean_conf": 0.653714859437751, "calib/mu_c": 0.78703007518797, "calib/mu_w": 0.5008620689655172, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19500000000000003, "calib/std_conf": 0.4122793373772121, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9098790322580644, "calib/step_q_c_n": 1116.0, "calib/step_q_gap": 0.029883807807252527, "calib/step_q_w": 0.8799952244508119, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 741.390625, "completions/mean_terminated_length": 756.1593627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 0.1696, "grad_norm": 0.03104994259774685, "kl": 0.17242431640625, "learning_rate": 1.138888888888889e-06, "loss": -0.0501, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020054301247000694, "mask/share_reasoning": 0.8368874192237854, "mask/share_step_conf": 0.12352701276540756, "num_tokens": 48267958.0, "reward": 0.6443827152252197, "reward_std": 0.3411294221878052, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6898362636566162, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.301272988319397, "step": 159 }, { "adv/mean_abs_final_conf": 0.7009190320968628, "adv/mean_abs_reasoning": 0.5025119185447693, "adv/mean_abs_step_conf": 0.7787777781486511, "adv/ratio_final_to_reasoning": 1.3948306621794428, "adv/ratio_step_to_reasoning": 1.5497697654692921, "adv/std_final_conf": 0.8760209679603577, "adv/std_reasoning": 0.7579127550125122, "adv/std_step_conf": 0.9360190033912659, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6175496688741722, "calib/avg_num_step_conf": 9.06640625, "calib/ece": 0.29861924686192465, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5815899581589958, "calib/gap": 0.18110776640577975, "calib/mean_conf": 0.6735146443514644, "calib/mu_c": 0.7401986754966888, "calib/mu_w": 0.5590909090909091, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17016736401673635, "calib/std_conf": 0.39207445123159396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893084935897436, "calib/step_q_c_n": 1248.0, "calib/step_q_gap": 0.045927433567519826, "calib/step_q_w": 0.8471575023299162, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 707.03125, "completions/mean_terminated_length": 744.85595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.17066666666666666, "grad_norm": 0.04126443713903427, "kl": 0.1840667724609375, "learning_rate": 1.111111111111111e-06, "loss": -0.0968, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.019329741597175598, "mask/share_reasoning": 0.8126784563064575, "mask/share_step_conf": 0.11721055209636688, "num_tokens": 48553798.0, "reward": 0.6182776689529419, "reward_std": 0.3836175203323364, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6499402523040771, "rewards/format_reward_step": 0.93359375, "rewards/step_correlation_reward": 0.28192758560180664, "step": 160 }, { "adv/mean_abs_final_conf": 0.572830855846405, "adv/mean_abs_reasoning": 0.3294076919555664, "adv/mean_abs_step_conf": 0.7416273355484009, "adv/ratio_final_to_reasoning": 1.7389723125338368, "adv/ratio_step_to_reasoning": 2.251396532806036, "adv/std_final_conf": 0.8279606103897095, "adv/std_reasoning": 0.6402148008346558, "adv/std_step_conf": 0.9353938102722168, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6893785883147585, "calib/avg_num_step_conf": 8.98046875, "calib/ece": 0.22601593625497995, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6334661354581673, "calib/gap": 0.27145558932792946, "calib/mean_conf": 0.7069721115537849, "calib/mu_c": 0.7751063829787233, "calib/mu_w": 0.5036507936507938, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09199203187250984, "calib/std_conf": 0.3878477686573863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9108514605344934, "calib/step_q_c_n": 1609.0, "calib/step_q_gap": 0.025170301114203375, "calib/step_q_w": 0.88568115942029, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 729.03515625, "completions/mean_terminated_length": 734.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.17173333333333332, "grad_norm": 0.04238247498869896, "kl": 0.174163818359375, "learning_rate": 1.0833333333333335e-06, "loss": 0.028, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.021196896210312843, "mask/share_reasoning": 0.8361594080924988, "mask/share_step_conf": 0.1348312497138977, "num_tokens": 48844351.0, "reward": 0.7657527327537537, "reward_std": 0.283169686794281, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7430897951126099, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.4462280869483948, "step": 161 }, { "adv/mean_abs_final_conf": 0.5648688673973083, "adv/mean_abs_reasoning": 0.4343217611312866, "adv/mean_abs_step_conf": 0.7546055316925049, "adv/ratio_final_to_reasoning": 1.3005769407592727, "adv/ratio_step_to_reasoning": 1.7374343153494514, "adv/std_final_conf": 0.8103326559066772, "adv/std_reasoning": 0.7014553546905518, "adv/std_step_conf": 0.9359780550003052, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7571895424836601, "calib/avg_num_step_conf": 8.17578125, "calib/ece": 0.18587250996015942, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6414342629482072, "calib/gap": 0.37191169208424096, "calib/mean_conf": 0.7117689243027888, "calib/mu_c": 0.8317882352941176, "calib/mu_w": 0.45987654320987664, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11017529880478091, "calib/std_conf": 0.38622836894578855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9041127063890884, "calib/step_q_c_n": 1393.0, "calib/step_q_gap": 0.03745556353194557, "calib/step_q_w": 0.8666571428571428, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 713.2265625, "completions/mean_terminated_length": 718.842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.1728, "grad_norm": 0.022894565016031265, "kl": 0.178131103515625, "learning_rate": 1.0555555555555557e-06, "loss": -0.0356, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.021328045055270195, "mask/share_reasoning": 0.8429559469223022, "mask/share_step_conf": 0.127903550863266, "num_tokens": 49131081.0, "reward": 0.7984813451766968, "reward_std": 0.2730226218700409, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.778145968914032, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.48991042375564575, "step": 162 }, { "adv/mean_abs_final_conf": 0.6256484985351562, "adv/mean_abs_reasoning": 0.48577702045440674, "adv/mean_abs_step_conf": 0.774186372756958, "adv/ratio_final_to_reasoning": 1.287933500744664, "adv/ratio_step_to_reasoning": 1.5937072775339738, "adv/std_final_conf": 0.8259115815162659, "adv/std_reasoning": 0.7394855618476868, "adv/std_step_conf": 0.9359018206596375, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7147538697531725, "calib/avg_num_step_conf": 8.875, "calib/ece": 0.22711522633744846, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5679012345679012, "calib/gap": 0.30850432296750807, "calib/mean_conf": 0.6559218106995885, "calib/mu_c": 0.7841478873239438, "calib/mu_w": 0.47564356435643573, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1493374485596707, "calib/std_conf": 0.4039821305301361, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8987555953446733, "calib/step_q_c_n": 1117.0, "calib/step_q_gap": 0.12455646114553909, "calib/step_q_w": 0.7741991341991342, "calib/step_q_w_n": 1155.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 771.44921875, "completions/mean_terminated_length": 789.9640502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.17386666666666667, "grad_norm": 0.03064306080341339, "kl": 0.16400146484375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0444, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01990315318107605, "mask/share_reasoning": 0.8389712572097778, "mask/share_step_conf": 0.11768805980682373, "num_tokens": 49433404.0, "reward": 0.6272368431091309, "reward_std": 0.3007856011390686, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7011457085609436, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.2525468170642853, "step": 163 }, { "adv/mean_abs_final_conf": 0.648464024066925, "adv/mean_abs_reasoning": 0.4645434021949768, "adv/mean_abs_step_conf": 0.74281245470047, "adv/ratio_final_to_reasoning": 1.3959169821440143, "adv/ratio_step_to_reasoning": 1.5990162624001683, "adv/std_final_conf": 0.8585214018821716, "adv/std_reasoning": 0.7393715381622314, "adv/std_step_conf": 0.9360941052436829, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7796841684434969, "calib/avg_num_step_conf": 8.609375, "calib/ece": 0.15414634146341474, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5203252032520326, "calib/gap": 0.43533448827292126, "calib/mean_conf": 0.6191869918699187, "calib/mu_c": 0.8173880597014928, "calib/mu_w": 0.3820535714285715, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.114308943089431, "calib/std_conf": 0.4069240026046992, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8863724624889673, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": 0.0132912299959701, "calib/step_q_w": 0.8730812324929972, "calib/step_q_w_n": 1071.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 803.91015625, "completions/mean_terminated_length": 823.2040405273438, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.17493333333333333, "grad_norm": 0.0358455553650856, "kl": 0.165863037109375, "learning_rate": 1.0000000000000002e-06, "loss": -0.0161, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01811538077890873, "mask/share_reasoning": 0.8500458002090454, "mask/share_step_conf": 0.10840128362178802, "num_tokens": 49745341.0, "reward": 0.6460665464401245, "reward_std": 0.3104490339756012, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7656679749488831, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.22959008812904358, "step": 164 }, { "adv/mean_abs_final_conf": 0.5592621564865112, "adv/mean_abs_reasoning": 0.4206353425979614, "adv/mean_abs_step_conf": 0.7922811508178711, "adv/ratio_final_to_reasoning": 1.329565302412184, "adv/ratio_step_to_reasoning": 1.883534431330761, "adv/std_final_conf": 0.8088847398757935, "adv/std_reasoning": 0.6817131042480469, "adv/std_step_conf": 0.9359111189842224, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.760966306420852, "calib/avg_num_step_conf": 8.69140625, "calib/ece": 0.2168525896414342, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5976095617529881, "calib/gap": 0.3785435473617293, "calib/mean_conf": 0.6782071713147411, "calib/mu_c": 0.8606923076923078, "calib/mu_w": 0.48214876033057846, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18856573705179278, "calib/std_conf": 0.3996058477726149, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9013636363636365, "calib/step_q_c_n": 1078.0, "calib/step_q_gap": 0.020980026947769015, "calib/step_q_w": 0.8803836094158675, "calib/step_q_w_n": 1147.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 773.25, "completions/mean_terminated_length": 782.4190063476562, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.176, "grad_norm": 0.027194950729608536, "kl": 0.169952392578125, "learning_rate": 9.722222222222224e-07, "loss": 0.0021, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019431117922067642, "mask/share_reasoning": 0.8503310680389404, "mask/share_step_conf": 0.11851904541254044, "num_tokens": 50048869.0, "reward": 0.5861892700195312, "reward_std": 0.3149217367172241, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7353046536445618, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.14098012447357178, "step": 165 }, { "adv/mean_abs_final_conf": 0.6314215064048767, "adv/mean_abs_reasoning": 0.4578407108783722, "adv/mean_abs_step_conf": 0.7630418539047241, "adv/ratio_final_to_reasoning": 1.3791292285770917, "adv/ratio_step_to_reasoning": 1.6666098836881944, "adv/std_final_conf": 0.8481122851371765, "adv/std_reasoning": 0.7393897771835327, "adv/std_step_conf": 0.9354910850524902, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7865882352941176, "calib/avg_num_step_conf": 8.30859375, "calib/ece": 0.1518734693877551, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6285714285714286, "calib/gap": 0.46196274509803925, "calib/mean_conf": 0.6988122448979591, "calib/mu_c": 0.8402294117647059, "calib/mu_w": 0.37826666666666664, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07840408163265303, "calib/std_conf": 0.39420472958813463, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8941734906315059, "calib/step_q_c_n": 1441.0, "calib/step_q_gap": 0.03326970054404221, "calib/step_q_w": 0.8609037900874636, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 772.12109375, "completions/mean_terminated_length": 787.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.17706666666666668, "grad_norm": 0.026412660256028175, "kl": 0.166351318359375, "learning_rate": 9.444444444444445e-07, "loss": 0.0237, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019667625427246094, "mask/share_reasoning": 0.838019847869873, "mask/share_step_conf": 0.12278129160404205, "num_tokens": 50352716.0, "reward": 0.7692021131515503, "reward_std": 0.28766733407974243, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7928230166435242, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.42136237025260925, "step": 166 }, { "adv/mean_abs_final_conf": 0.5876627564430237, "adv/mean_abs_reasoning": 0.46079394221305847, "adv/mean_abs_step_conf": 0.7655038833618164, "adv/ratio_final_to_reasoning": 1.2753265670565272, "adv/ratio_step_to_reasoning": 1.661271586352297, "adv/std_final_conf": 0.8277955651283264, "adv/std_reasoning": 0.7207027673721313, "adv/std_step_conf": 0.9362152218818665, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6482921318447634, "calib/avg_num_step_conf": 8.40625, "calib/ece": 0.27191235059760954, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7250996015936255, "calib/gap": 0.24071703880914408, "calib/mean_conf": 0.7774900398406376, "calib/mu_c": 0.8724342105263159, "calib/mu_w": 0.6317171717171718, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22191235059760955, "calib/std_conf": 0.36075572915192167, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8928288973384032, "calib/step_q_c_n": 1315.0, "calib/step_q_gap": -0.00788794853973307, "calib/step_q_w": 0.9007168458781363, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 762.58203125, "completions/mean_terminated_length": 765.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 362.0, "epoch": 0.17813333333333334, "grad_norm": 0.0374019555747509, "kl": 0.1687774658203125, "learning_rate": 9.166666666666666e-07, "loss": 0.03, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019959034398198128, "mask/share_reasoning": 0.8518633842468262, "mask/share_step_conf": 0.12427132576704025, "num_tokens": 50653545.0, "reward": 0.6505559682846069, "reward_std": 0.3292909264564514, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7024472951889038, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.28382089734077454, "step": 167 }, { "adv/mean_abs_final_conf": 0.6799878478050232, "adv/mean_abs_reasoning": 0.5563756227493286, "adv/mean_abs_step_conf": 0.7683614492416382, "adv/ratio_final_to_reasoning": 1.2221740493317539, "adv/ratio_step_to_reasoning": 1.3810120677911484, "adv/std_final_conf": 0.8610654473304749, "adv/std_reasoning": 0.7931175827980042, "adv/std_step_conf": 0.9361603260040283, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7151613357756799, "calib/avg_num_step_conf": 9.99609375, "calib/ece": 0.20934693877551025, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6244897959183674, "calib/gap": 0.34080597435536125, "calib/mean_conf": 0.7038775510204082, "calib/mu_c": 0.834635761589404, "calib/mu_w": 0.49382978723404275, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14844897959183678, "calib/std_conf": 0.3908964909524009, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8802591973244147, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": 0.03288575638530966, "calib/step_q_w": 0.847373440939105, "calib/step_q_w_n": 1363.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 815.10546875, "completions/mean_terminated_length": 831.3426513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.1792, "grad_norm": 0.025498026981949806, "kl": 0.1545257568359375, "learning_rate": 8.88888888888889e-07, "loss": -0.0058, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018487967550754547, "mask/share_reasoning": 0.8505588173866272, "mask/share_step_conf": 0.11142198741436005, "num_tokens": 50966884.0, "reward": 0.6711308360099792, "reward_std": 0.3440842032432556, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7275401949882507, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.306127667427063, "step": 168 }, { "adv/mean_abs_final_conf": 0.598463237285614, "adv/mean_abs_reasoning": 0.4369748830795288, "adv/mean_abs_step_conf": 0.731543779373169, "adv/ratio_final_to_reasoning": 1.369559808719474, "adv/ratio_step_to_reasoning": 1.6741094458740984, "adv/std_final_conf": 0.8424903154373169, "adv/std_reasoning": 0.7204640507698059, "adv/std_step_conf": 0.9356883764266968, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6809899198759369, "calib/avg_num_step_conf": 8.72265625, "calib/ece": 0.2695238095238095, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6626984126984127, "calib/gap": 0.25095115016800207, "calib/mean_conf": 0.744920634920635, "calib/mu_c": 0.8504794520547946, "calib/mu_w": 0.5995283018867925, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21753968253968256, "calib/std_conf": 0.36775972580396354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8915513126491645, "calib/step_q_c_n": 1257.0, "calib/step_q_gap": -0.011676146367228801, "calib/step_q_w": 0.9032274590163933, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 759.2265625, "completions/mean_terminated_length": 768.2293090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.18026666666666666, "grad_norm": 0.12461809068918228, "kl": 0.1781768798828125, "learning_rate": 8.611111111111112e-07, "loss": -0.0252, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0193403922021389, "mask/share_reasoning": 0.8507750034332275, "mask/share_step_conf": 0.11816590279340744, "num_tokens": 51265430.0, "reward": 0.6641005277633667, "reward_std": 0.30344459414482117, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7047703266143799, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.31171196699142456, "step": 169 }, { "adv/mean_abs_final_conf": 0.5729464292526245, "adv/mean_abs_reasoning": 0.5114200711250305, "adv/mean_abs_step_conf": 0.7788619995117188, "adv/ratio_final_to_reasoning": 1.1203049344392122, "adv/ratio_step_to_reasoning": 1.5229398365190576, "adv/std_final_conf": 0.7941260933876038, "adv/std_reasoning": 0.7753347158432007, "adv/std_step_conf": 0.9359708428382874, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7183445945945945, "calib/avg_num_step_conf": 9.09375, "calib/ece": 0.22644354838709668, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6854838709677419, "calib/gap": 0.3284681081081082, "calib/mean_conf": 0.7474112903225807, "calib/mu_c": 0.8798581081081083, "calib/mu_w": 0.55139, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18854032258064507, "calib/std_conf": 0.37516738225922347, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8987419871794873, "calib/step_q_c_n": 1248.0, "calib/step_q_gap": 0.0009179131054131151, "calib/step_q_w": 0.8978240740740742, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 784.21484375, "completions/mean_terminated_length": 793.5138549804688, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.18133333333333335, "grad_norm": 0.02952180802822113, "kl": 0.159332275390625, "learning_rate": 8.333333333333333e-07, "loss": 0.0433, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018694642931222916, "mask/share_reasoning": 0.8499318361282349, "mask/share_step_conf": 0.1196548119187355, "num_tokens": 51570341.0, "reward": 0.676643431186676, "reward_std": 0.3154790997505188, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7304421663284302, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.31346967816352844, "step": 170 }, { "adv/mean_abs_final_conf": 0.6785448789596558, "adv/mean_abs_reasoning": 0.5094852447509766, "adv/mean_abs_step_conf": 0.7583144307136536, "adv/ratio_final_to_reasoning": 1.331824397174272, "adv/ratio_step_to_reasoning": 1.488393311732312, "adv/std_final_conf": 0.8765658736228943, "adv/std_reasoning": 0.7927893400192261, "adv/std_step_conf": 0.9364736676216125, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6562177169421488, "calib/avg_num_step_conf": 8.03125, "calib/ece": 0.32253012048192764, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6345381526104418, "calib/gap": 0.2068220557851238, "calib/mean_conf": 0.711285140562249, "calib/mu_c": 0.8176033057851239, "calib/mu_w": 0.61078125, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2739357429718875, "calib/std_conf": 0.3855314422871999, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893192118226601, "calib/step_q_c_n": 1015.0, "calib/step_q_gap": 0.04358597029192279, "calib/step_q_w": 0.8496061479346783, "calib/step_q_w_n": 1041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 748.3671875, "completions/mean_terminated_length": 757.2411499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.1824, "grad_norm": 0.047669488936662674, "kl": 0.16168212890625, "learning_rate": 8.055555555555557e-07, "loss": -0.0245, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020132333040237427, "mask/share_reasoning": 0.8459810018539429, "mask/share_step_conf": 0.12216789275407791, "num_tokens": 51868819.0, "reward": 0.5441136360168457, "reward_std": 0.3625720143318176, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6362277269363403, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.1629369854927063, "step": 171 }, { "adv/mean_abs_final_conf": 0.60759437084198, "adv/mean_abs_reasoning": 0.531531572341919, "adv/mean_abs_step_conf": 0.7559362649917603, "adv/ratio_final_to_reasoning": 1.1431011861909344, "adv/ratio_step_to_reasoning": 1.4221850673161673, "adv/std_final_conf": 0.8296145796775818, "adv/std_reasoning": 0.7928065061569214, "adv/std_step_conf": 0.9353254437446594, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6312409812409812, "calib/avg_num_step_conf": 8.15625, "calib/ece": 0.28329317269076304, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7751004016064257, "calib/gap": 0.14640909090909093, "calib/mean_conf": 0.8395180722891568, "calib/mu_c": 0.888909090909091, "calib/mu_w": 0.7425, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23008032128514053, "calib/std_conf": 0.3036547273606863, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8989647577092512, "calib/step_q_c_n": 1362.0, "calib/step_q_gap": 0.014942719141758043, "calib/step_q_w": 0.8840220385674932, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 740.01171875, "completions/mean_terminated_length": 748.78662109375, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.18346666666666667, "grad_norm": 0.036135055124759674, "kl": 0.1663055419921875, "learning_rate": 7.777777777777779e-07, "loss": 0.0072, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02015288546681404, "mask/share_reasoning": 0.8440628051757812, "mask/share_step_conf": 0.12406553328037262, "num_tokens": 52161614.0, "reward": 0.7183195352554321, "reward_std": 0.36564695835113525, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6987804174423218, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.4144209623336792, "step": 172 }, { "adv/mean_abs_final_conf": 0.6681346893310547, "adv/mean_abs_reasoning": 0.5042810440063477, "adv/mean_abs_step_conf": 0.7563927173614502, "adv/ratio_final_to_reasoning": 1.3249252520438672, "adv/ratio_step_to_reasoning": 1.4999427925193418, "adv/std_final_conf": 0.8785695433616638, "adv/std_reasoning": 0.7753050923347473, "adv/std_step_conf": 0.936163604259491, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6136217605482972, "calib/avg_num_step_conf": 9.46875, "calib/ece": 0.26256854838709687, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7782258064516129, "calib/gap": 0.21532512315270913, "calib/mean_conf": 0.822891129032258, "calib/mu_c": 0.8984285714285714, "calib/mu_w": 0.6831034482758622, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2181330645161291, "calib/std_conf": 0.3315241565450681, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8894685314685316, "calib/step_q_c_n": 1430.0, "calib/step_q_gap": -0.00946507014112652, "calib/step_q_w": 0.8989336016096581, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 767.86328125, "completions/mean_terminated_length": 773.909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.18453333333333333, "grad_norm": 0.03230111673474312, "kl": 0.1728515625, "learning_rate": 7.5e-07, "loss": -0.0651, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01997010037302971, "mask/share_reasoning": 0.8442978858947754, "mask/share_step_conf": 0.1279194951057434, "num_tokens": 52461347.0, "reward": 0.6749608516693115, "reward_std": 0.3358026146888733, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7034523487091064, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.32850050926208496, "step": 173 }, { "adv/mean_abs_final_conf": 0.7172695398330688, "adv/mean_abs_reasoning": 0.5680016279220581, "adv/mean_abs_step_conf": 0.8021489381790161, "adv/ratio_final_to_reasoning": 1.262794866375794, "adv/ratio_step_to_reasoning": 1.412229998553962, "adv/std_final_conf": 0.8888024091720581, "adv/std_reasoning": 0.793002188205719, "adv/std_step_conf": 0.9364355802536011, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6110557106767718, "calib/avg_num_step_conf": 8.75390625, "calib/ece": 0.3080566801619432, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.1802566148118605, "calib/mean_conf": 0.7419028340080971, "calib/mu_c": 0.8214492753623189, "calib/mu_w": 0.6411926605504584, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24562753036437238, "calib/std_conf": 0.3722543516195086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8913099041533546, "calib/step_q_c_n": 1252.0, "calib/step_q_gap": 0.01091556643849112, "calib/step_q_w": 0.8803943377148635, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 818.765625, "completions/mean_terminated_length": 831.761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.1856, "grad_norm": 0.02354642190039158, "kl": 0.1643524169921875, "learning_rate": 7.222222222222222e-07, "loss": -0.0409, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0181894451379776, "mask/share_reasoning": 0.848423182964325, "mask/share_step_conf": 0.11776234954595566, "num_tokens": 52775183.0, "reward": 0.5617752075195312, "reward_std": 0.4311276376247406, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6466355323791504, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.17613360285758972, "step": 174 }, { "adv/mean_abs_final_conf": 0.6367019414901733, "adv/mean_abs_reasoning": 0.5002825856208801, "adv/mean_abs_step_conf": 0.7642124891281128, "adv/ratio_final_to_reasoning": 1.272684598245587, "adv/ratio_step_to_reasoning": 1.5275616443448259, "adv/std_final_conf": 0.8446736335754395, "adv/std_reasoning": 0.7754291892051697, "adv/std_step_conf": 0.9358320236206055, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6698145604395604, "calib/avg_num_step_conf": 8.6171875, "calib/ece": 0.3659795081967213, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6434426229508197, "calib/gap": 0.22436950549450552, "calib/mean_conf": 0.736061475409836, "calib/mu_c": 0.864798076923077, "calib/mu_w": 0.6404285714285715, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.337905737704918, "calib/std_conf": 0.37361490692930116, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8871815144766148, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.05556071936958107, "calib/step_q_w": 0.8316207951070337, "calib/step_q_w_n": 1308.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 799.3125, "completions/mean_terminated_length": 818.4960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.18666666666666668, "grad_norm": 0.0368288978934288, "kl": 0.1604461669921875, "learning_rate": 6.944444444444446e-07, "loss": -0.045, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019019050523638725, "mask/share_reasoning": 0.8398895263671875, "mask/share_step_conf": 0.11765392124652863, "num_tokens": 53085631.0, "reward": 0.4227388799190521, "reward_std": 0.3410660922527313, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.5925886631011963, "rewards/format_reward_step": 0.94140625, "rewards/step_correlation_reward": -0.016642220318317413, "step": 175 }, { "adv/mean_abs_final_conf": 0.6359574198722839, "adv/mean_abs_reasoning": 0.6153806447982788, "adv/mean_abs_step_conf": 0.7664022445678711, "adv/ratio_final_to_reasoning": 1.0334374752406297, "adv/ratio_step_to_reasoning": 1.2454116830715354, "adv/std_final_conf": 0.858352780342102, "adv/std_reasoning": 0.8266722559928894, "adv/std_step_conf": 0.9365108609199524, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6957075788061703, "calib/avg_num_step_conf": 8.453125, "calib/ece": 0.2355263157894737, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7206477732793523, "calib/gap": 0.32593259557344045, "calib/mean_conf": 0.7736639676113359, "calib/mu_c": 0.9122183098591548, "calib/mu_w": 0.5862857142857143, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21714574898785427, "calib/std_conf": 0.36504289088194597, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8983413848631242, "calib/step_q_c_n": 1242.0, "calib/step_q_gap": 0.04184463865921961, "calib/step_q_w": 0.8564967462039046, "calib/step_q_w_n": 922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 771.73046875, "completions/mean_terminated_length": 780.8814697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.18773333333333334, "grad_norm": 0.025098109617829323, "kl": 0.1585693359375, "learning_rate": 6.666666666666667e-07, "loss": -0.0358, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019676584750413895, "mask/share_reasoning": 0.8429697155952454, "mask/share_step_conf": 0.12563498318195343, "num_tokens": 53387258.0, "reward": 0.6907268762588501, "reward_std": 0.4108719229698181, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7160639762878418, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.36148351430892944, "step": 176 }, { "adv/mean_abs_final_conf": 0.6260363459587097, "adv/mean_abs_reasoning": 0.42240333557128906, "adv/mean_abs_step_conf": 0.7517305612564087, "adv/ratio_final_to_reasoning": 1.4820819184867764, "adv/ratio_step_to_reasoning": 1.779651101096807, "adv/std_final_conf": 0.8498795032501221, "adv/std_reasoning": 0.7014126181602478, "adv/std_step_conf": 0.9360769391059875, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7296880131362891, "calib/avg_num_step_conf": 8.5546875, "calib/ece": 0.21883999999999992, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.712, "calib/gap": 0.35711986863710987, "calib/mean_conf": 0.78132, "calib/mu_c": 0.9313103448275861, "calib/mu_w": 0.5741904761904763, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21007999999999993, "calib/std_conf": 0.35204240880893883, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9011712439418418, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.037274185118312264, "calib/step_q_w": 0.8638970588235295, "calib/step_q_w_n": 952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 751.49609375, "completions/mean_terminated_length": 763.4246215820312, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.1888, "grad_norm": 0.03494914993643761, "kl": 0.1652984619140625, "learning_rate": 6.388888888888889e-07, "loss": -0.0653, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020012080669403076, "mask/share_reasoning": 0.8428300619125366, "mask/share_step_conf": 0.1215328723192215, "num_tokens": 53683473.0, "reward": 0.6597840785980225, "reward_std": 0.35069000720977783, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7479737997055054, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2630005180835724, "step": 177 }, { "adv/mean_abs_final_conf": 0.6410762667655945, "adv/mean_abs_reasoning": 0.5759428143501282, "adv/mean_abs_step_conf": 0.756990909576416, "adv/ratio_final_to_reasoning": 1.1130901381050484, "adv/ratio_step_to_reasoning": 1.314350818719, "adv/std_final_conf": 0.860005259513855, "adv/std_reasoning": 0.8267216086387634, "adv/std_step_conf": 0.9359890222549438, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7718120805369126, "calib/avg_num_step_conf": 8.94921875, "calib/ece": 0.19368852459016384, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7377049180327869, "calib/gap": 0.3920593429883431, "calib/mean_conf": 0.788360655737705, "calib/mu_c": 0.9410067114093958, "calib/mu_w": 0.5489473684210527, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1856967213114753, "calib/std_conf": 0.35428491973786913, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9087252573238321, "calib/step_q_c_n": 1263.0, "calib/step_q_gap": 0.028744712576750264, "calib/step_q_w": 0.8799805447470819, "calib/step_q_w_n": 1028.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2829.0, "completions/max_terminated_length": 2829.0, "completions/mean_length": 737.47265625, "completions/mean_terminated_length": 758.2047729492188, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.18986666666666666, "grad_norm": 0.029399629682302475, "kl": 0.16845703125, "learning_rate": 6.111111111111112e-07, "loss": -0.0332, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01988956332206726, "mask/share_reasoning": 0.8306618928909302, "mask/share_step_conf": 0.12210480123758316, "num_tokens": 53978338.0, "reward": 0.6872397065162659, "reward_std": 0.40832090377807617, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7544710636138916, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.31297701597213745, "step": 178 }, { "adv/mean_abs_final_conf": 0.6690976023674011, "adv/mean_abs_reasoning": 0.5811420679092407, "adv/mean_abs_step_conf": 0.7728196978569031, "adv/ratio_final_to_reasoning": 1.151349453627743, "adv/ratio_step_to_reasoning": 1.329829211361786, "adv/std_final_conf": 0.8573887348175049, "adv/std_reasoning": 0.8100538849830627, "adv/std_step_conf": 0.9364103078842163, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6923720541068191, "calib/avg_num_step_conf": 8.75390625, "calib/ece": 0.25281893004115213, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7078189300411523, "calib/gap": 0.28196520708408834, "calib/mean_conf": 0.7846707818930042, "calib/mu_c": 0.9018661971830984, "calib/mu_w": 0.61990099009901, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22656378600823035, "calib/std_conf": 0.3496295727158898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9004632587859426, "calib/step_q_c_n": 1252.0, "calib/step_q_gap": 0.0475006703127373, "calib/step_q_w": 0.8529625884732053, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 784.828125, "completions/mean_terminated_length": 803.6640625, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.19093333333333334, "grad_norm": 0.023182764649391174, "kl": 0.1683807373046875, "learning_rate": 5.833333333333334e-07, "loss": -0.0486, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018749622628092766, "mask/share_reasoning": 0.8373773097991943, "mask/share_step_conf": 0.12043555825948715, "num_tokens": 54285518.0, "reward": 0.6336249113082886, "reward_std": 0.3934309780597687, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6945639252662659, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.2719046473503113, "step": 179 }, { "adv/mean_abs_final_conf": 0.5988168716430664, "adv/mean_abs_reasoning": 0.47128474712371826, "adv/mean_abs_step_conf": 0.7732551097869873, "adv/ratio_final_to_reasoning": 1.2706052451255532, "adv/ratio_step_to_reasoning": 1.6407386712729704, "adv/std_final_conf": 0.8279719948768616, "adv/std_reasoning": 0.7206318974494934, "adv/std_step_conf": 0.9363507628440857, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6941620879120879, "calib/avg_num_step_conf": 8.71875, "calib/ece": 0.21770916334661342, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7330677290836654, "calib/gap": 0.2895188873626372, "calib/mean_conf": 0.807191235059761, "calib/mu_c": 0.91215625, "calib/mu_w": 0.6226373626373628, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1937250996015935, "calib/std_conf": 0.32948677234491136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8736913767019667, "calib/step_q_c_n": 1322.0, "calib/step_q_gap": 0.027339728350318282, "calib/step_q_w": 0.8463516483516484, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 817.60546875, "completions/mean_terminated_length": 820.8118286132812, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.192, "grad_norm": 0.03066716343164444, "kl": 0.1658477783203125, "learning_rate": 5.555555555555555e-07, "loss": 0.0516, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018639039248228073, "mask/share_reasoning": 0.8629419207572937, "mask/share_step_conf": 0.11451278626918793, "num_tokens": 54598681.0, "reward": 0.7222050428390503, "reward_std": 0.339335173368454, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.750390887260437, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3729255199432373, "step": 180 }, { "adv/mean_abs_final_conf": 0.6700544357299805, "adv/mean_abs_reasoning": 0.5483631491661072, "adv/mean_abs_step_conf": 0.8035197257995605, "adv/ratio_final_to_reasoning": 1.2219173311498568, "adv/ratio_step_to_reasoning": 1.4653058416151206, "adv/std_final_conf": 0.8462985754013062, "adv/std_reasoning": 0.7755341529846191, "adv/std_step_conf": 0.9353812336921692, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7128220140515222, "calib/avg_num_step_conf": 8.2109375, "calib/ece": 0.2820362903225807, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6612903225806451, "calib/gap": 0.3125117096018738, "calib/mean_conf": 0.7534072580645162, "calib/mu_c": 0.9071428571428573, "calib/mu_w": 0.5946311475409835, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2636895161290323, "calib/std_conf": 0.3576950509350801, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9033270142180094, "calib/step_q_c_n": 1055.0, "calib/step_q_gap": 0.02008919186843927, "calib/step_q_w": 0.8832378223495702, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 729.48046875, "completions/mean_terminated_length": 746.988037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 395.0, "epoch": 0.19306666666666666, "grad_norm": 0.043537769466638565, "kl": 0.1852874755859375, "learning_rate": 5.277777777777779e-07, "loss": -0.1066, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01987295411527157, "mask/share_reasoning": 0.8370122909545898, "mask/share_step_conf": 0.11967720091342926, "num_tokens": 54891692.0, "reward": 0.5772460103034973, "reward_std": 0.40024036169052124, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6956995129585266, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.16660496592521667, "step": 181 }, { "adv/mean_abs_final_conf": 0.6767345666885376, "adv/mean_abs_reasoning": 0.4965467154979706, "adv/mean_abs_step_conf": 0.7697621583938599, "adv/ratio_final_to_reasoning": 1.3628819717594194, "adv/ratio_step_to_reasoning": 1.5502310948163063, "adv/std_final_conf": 0.8484857082366943, "adv/std_reasoning": 0.7393876314163208, "adv/std_step_conf": 0.9362363815307617, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6834075015893197, "calib/avg_num_step_conf": 8.13671875, "calib/ece": 0.27547430830039515, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7391304347826086, "calib/gap": 0.2484335664335665, "calib/mean_conf": 0.8079644268774704, "calib/mu_c": 0.9159790209790208, "calib/mu_w": 0.6675454545454543, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25911067193675874, "calib/std_conf": 0.33439330160378195, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8946246764452115, "calib/step_q_c_n": 1159.0, "calib/step_q_gap": 0.011410390730925646, "calib/step_q_w": 0.8832142857142858, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 766.30078125, "completions/mean_terminated_length": 769.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.19413333333333332, "grad_norm": 0.029424257576465607, "kl": 0.160125732421875, "learning_rate": 5.000000000000001e-07, "loss": 0.0336, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01990341767668724, "mask/share_reasoning": 0.8551667332649231, "mask/share_step_conf": 0.12102358043193817, "num_tokens": 55194025.0, "reward": 0.5911649465560913, "reward_std": 0.4348578155040741, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6973428726196289, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.17561198770999908, "step": 182 }, { "adv/mean_abs_final_conf": 0.6397321224212646, "adv/mean_abs_reasoning": 0.5764783620834351, "adv/mean_abs_step_conf": 0.7763506174087524, "adv/ratio_final_to_reasoning": 1.1097244311290815, "adv/ratio_step_to_reasoning": 1.3467125021014916, "adv/std_final_conf": 0.8472661375999451, "adv/std_reasoning": 0.8100085854530334, "adv/std_step_conf": 0.9365969896316528, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6715827832783278, "calib/avg_num_step_conf": 8.19921875, "calib/ece": 0.24405714285714283, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6448979591836734, "calib/gap": 0.26941996699669957, "calib/mean_conf": 0.7220163265306123, "calib/mu_c": 0.8330833333333333, "calib/mu_w": 0.5636633663366337, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18915918367346934, "calib/std_conf": 0.3782347286243372, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.888424599831508, "calib/step_q_c_n": 1187.0, "calib/step_q_gap": 0.015278766498174612, "calib/step_q_w": 0.8731458333333334, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 789.6171875, "completions/mean_terminated_length": 805.3466186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.1952, "grad_norm": 0.025317171588540077, "kl": 0.1699066162109375, "learning_rate": 4.7222222222222226e-07, "loss": -0.0435, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019160984084010124, "mask/share_reasoning": 0.847537636756897, "mask/share_step_conf": 0.11377011984586716, "num_tokens": 55502847.0, "reward": 0.6211838126182556, "reward_std": 0.3905571401119232, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6959280371665955, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.24253329634666443, "step": 183 }, { "adv/mean_abs_final_conf": 0.6272495985031128, "adv/mean_abs_reasoning": 0.5777990818023682, "adv/mean_abs_step_conf": 0.7672903537750244, "adv/ratio_final_to_reasoning": 1.085584277057849, "adv/ratio_step_to_reasoning": 1.3279535706106753, "adv/std_final_conf": 0.8438796401023865, "adv/std_reasoning": 0.809932291507721, "adv/std_step_conf": 0.936083197593689, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7013080399945213, "calib/avg_num_step_conf": 8.265625, "calib/ece": 0.2151336032388664, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.708502024291498, "calib/gap": 0.31017607177099005, "calib/mean_conf": 0.7942429149797571, "calib/mu_c": 0.9173087248322146, "calib/mu_w": 0.6071326530612245, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20306882591093117, "calib/std_conf": 0.33521408607401043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8948870292887028, "calib/step_q_c_n": 1195.0, "calib/step_q_gap": 0.010598212784902472, "calib/step_q_w": 0.8842888165038003, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 781.8359375, "completions/mean_terminated_length": 800.6000366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.19626666666666667, "grad_norm": 0.03133862093091011, "kl": 0.1655120849609375, "learning_rate": 4.444444444444445e-07, "loss": -0.0385, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01901405304670334, "mask/share_reasoning": 0.8426245450973511, "mask/share_step_conf": 0.11492390930652618, "num_tokens": 55808277.0, "reward": 0.6738928556442261, "reward_std": 0.37431710958480835, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7334769368171692, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.305714875459671, "step": 184 }, { "adv/mean_abs_final_conf": 0.5357584953308105, "adv/mean_abs_reasoning": 0.3198157846927643, "adv/mean_abs_step_conf": 0.7774848937988281, "adv/ratio_final_to_reasoning": 1.675209670609269, "adv/ratio_step_to_reasoning": 2.4310397766818492, "adv/std_final_conf": 0.7776016592979431, "adv/std_reasoning": 0.5961856842041016, "adv/std_step_conf": 0.9354940056800842, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7680244534520566, "calib/avg_num_step_conf": 8.53515625, "calib/ece": 0.1909039999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.72, "calib/gap": 0.4208968037743369, "calib/mean_conf": 0.786904, "calib/mu_c": 0.9569463087248321, "calib/mu_w": 0.5360495049504952, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1909039999999999, "calib/std_conf": 0.3492250832686564, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8939076923076923, "calib/step_q_c_n": 1300.0, "calib/step_q_gap": 0.049399217731420975, "calib/step_q_w": 0.8445084745762713, "calib/step_q_w_n": 885.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 752.76953125, "completions/mean_terminated_length": 761.6956787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.19733333333333333, "grad_norm": 0.041517455130815506, "kl": 0.159759521484375, "learning_rate": 4.1666666666666667e-07, "loss": -0.0224, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01972859725356102, "mask/share_reasoning": 0.8467744588851929, "mask/share_step_conf": 0.12177818268537521, "num_tokens": 56107906.0, "reward": 0.6698368787765503, "reward_std": 0.28276172280311584, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7846717834472656, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.243283212184906, "step": 185 }, { "adv/mean_abs_final_conf": 0.6632397770881653, "adv/mean_abs_reasoning": 0.49668216705322266, "adv/mean_abs_step_conf": 0.7803338170051575, "adv/ratio_final_to_reasoning": 1.3353404271047544, "adv/ratio_step_to_reasoning": 1.5710928814594218, "adv/std_final_conf": 0.8465085029602051, "adv/std_reasoning": 0.7393675446510315, "adv/std_step_conf": 0.9363483786582947, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6806158437330441, "calib/avg_num_step_conf": 8.55859375, "calib/ece": 0.2633654618473895, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.2574725990233314, "calib/mean_conf": 0.7321204819277108, "calib/mu_c": 0.8324210526315788, "calib/mu_w": 0.5749484536082474, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19252208835341364, "calib/std_conf": 0.38435834176155914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8957460197119029, "calib/step_q_c_n": 1319.0, "calib/step_q_gap": 0.057707028886214706, "calib/step_q_w": 0.8380389908256882, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 744.859375, "completions/mean_terminated_length": 759.6972045898438, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.1984, "grad_norm": 0.03142472729086876, "kl": 0.168121337890625, "learning_rate": 3.8888888888888895e-07, "loss": -0.0179, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019757648929953575, "mask/share_reasoning": 0.8379254341125488, "mask/share_step_conf": 0.12278568744659424, "num_tokens": 56403630.0, "reward": 0.6432186365127563, "reward_std": 0.3723026514053345, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.702370285987854, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.2707856297492981, "step": 186 }, { "adv/mean_abs_final_conf": 0.5947656035423279, "adv/mean_abs_reasoning": 0.4806092381477356, "adv/mean_abs_step_conf": 0.766196072101593, "adv/ratio_final_to_reasoning": 1.2375242844572651, "adv/ratio_step_to_reasoning": 1.5942183613750476, "adv/std_final_conf": 0.8132778406143188, "adv/std_reasoning": 0.7207170128822327, "adv/std_step_conf": 0.9363957643508911, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6926406926406926, "calib/avg_num_step_conf": 8.890625, "calib/ece": 0.25153225806451607, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6129032258064516, "calib/gap": 0.26433832833832815, "calib/mean_conf": 0.714516129032258, "calib/mu_c": 0.8264335664335662, "calib/mu_w": 0.5620952380952381, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1947177419354838, "calib/std_conf": 0.37622992892282464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8784045124899276, "calib/step_q_c_n": 1241.0, "calib/step_q_gap": -0.001010946447270511, "calib/step_q_w": 0.8794154589371981, "calib/step_q_w_n": 1035.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 803.65234375, "completions/mean_terminated_length": 816.4087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.19946666666666665, "grad_norm": 0.04483392834663391, "kl": 0.16119384765625, "learning_rate": 3.611111111111111e-07, "loss": 0.01, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018417365849018097, "mask/share_reasoning": 0.8510687351226807, "mask/share_step_conf": 0.11488892883062363, "num_tokens": 56710909.0, "reward": 0.641819953918457, "reward_std": 0.36996394395828247, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7017328143119812, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2764383852481842, "step": 187 }, { "adv/mean_abs_final_conf": 0.7041361331939697, "adv/mean_abs_reasoning": 0.5235980749130249, "adv/mean_abs_step_conf": 0.7775394916534424, "adv/ratio_final_to_reasoning": 1.3448027541180974, "adv/ratio_step_to_reasoning": 1.4849930297826244, "adv/std_final_conf": 0.8908984661102295, "adv/std_reasoning": 0.7754470109939575, "adv/std_step_conf": 0.9361128211021423, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6960666265302027, "calib/avg_num_step_conf": 8.23828125, "calib/ece": 0.23476000000000008, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.664, "calib/gap": 0.2933968827346308, "calib/mean_conf": 0.75004, "calib/mu_c": 0.8662251655629136, "calib/mu_w": 0.5728282828282828, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19040000000000007, "calib/std_conf": 0.36654112784242915, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8885902720527616, "calib/step_q_c_n": 1213.0, "calib/step_q_gap": 0.031045629195618818, "calib/step_q_w": 0.8575446428571428, "calib/step_q_w_n": 896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 778.60546875, "completions/mean_terminated_length": 790.96435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.20053333333333334, "grad_norm": 0.051865581423044205, "kl": 0.163238525390625, "learning_rate": 3.3333333333333335e-07, "loss": -0.0086, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019419366493821144, "mask/share_reasoning": 0.8485928177833557, "mask/share_step_conf": 0.11636281758546829, "num_tokens": 57014304.0, "reward": 0.6489625573158264, "reward_std": 0.33630305528640747, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7240316867828369, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2621746361255646, "step": 188 }, { "adv/mean_abs_final_conf": 0.5933226943016052, "adv/mean_abs_reasoning": 0.4214906394481659, "adv/mean_abs_step_conf": 0.7758980989456177, "adv/ratio_final_to_reasoning": 1.4076770366203373, "adv/ratio_step_to_reasoning": 1.8408430136466556, "adv/std_final_conf": 0.7960253953933716, "adv/std_reasoning": 0.7015754580497742, "adv/std_step_conf": 0.9360262751579285, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7496066933638443, "calib/avg_num_step_conf": 8.31640625, "calib/ece": 0.17793032786885243, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.569672131147541, "calib/gap": 0.3891518878718536, "calib/mean_conf": 0.6594877049180328, "calib/mu_c": 0.806217105263158, "calib/mu_w": 0.4170652173913044, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10723360655737701, "calib/std_conf": 0.4012795158599938, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8838832684824902, "calib/step_q_c_n": 1285.0, "calib/step_q_gap": 0.07731336326922011, "calib/step_q_w": 0.8065699052132701, "calib/step_q_w_n": 844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 710.45703125, "completions/mean_terminated_length": 730.4296875, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.2016, "grad_norm": 0.04199478030204773, "kl": 0.18646240234375, "learning_rate": 3.055555555555556e-07, "loss": -0.1269, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019616516306996346, "mask/share_reasoning": 0.8344583511352539, "mask/share_step_conf": 0.11858132481575012, "num_tokens": 57303949.0, "reward": 0.6880927085876465, "reward_std": 0.2997852861881256, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7487436532974243, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.31728553771972656, "step": 189 }, { "adv/mean_abs_final_conf": 0.6792967319488525, "adv/mean_abs_reasoning": 0.5358800888061523, "adv/mean_abs_step_conf": 0.7768614292144775, "adv/ratio_final_to_reasoning": 1.2676282365000864, "adv/ratio_step_to_reasoning": 1.4496926559544128, "adv/std_final_conf": 0.8648825883865356, "adv/std_reasoning": 0.7754979729652405, "adv/std_step_conf": 0.9365853667259216, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7335120470253859, "calib/avg_num_step_conf": 9.109375, "calib/ece": 0.21571999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.592, "calib/gap": 0.3768548543375751, "calib/mean_conf": 0.67572, "calib/mu_c": 0.8460583941605839, "calib/mu_w": 0.46920353982300883, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17171999999999998, "calib/std_conf": 0.4000196015197255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8913212669683259, "calib/step_q_c_n": 1105.0, "calib/step_q_gap": 0.04610529304819555, "calib/step_q_w": 0.8452159739201304, "calib/step_q_w_n": 1227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 802.4140625, "completions/mean_terminated_length": 815.1508178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.20266666666666666, "grad_norm": 0.02791653200984001, "kl": 0.164642333984375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0435, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01869148015975952, "mask/share_reasoning": 0.851791262626648, "mask/share_step_conf": 0.11389227211475372, "num_tokens": 57614975.0, "reward": 0.6310009360313416, "reward_std": 0.34945589303970337, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7447917461395264, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.21486634016036987, "step": 190 }, { "adv/mean_abs_final_conf": 0.6342035531997681, "adv/mean_abs_reasoning": 0.44904839992523193, "adv/mean_abs_step_conf": 0.7958725690841675, "adv/ratio_final_to_reasoning": 1.4123278321565451, "adv/ratio_step_to_reasoning": 1.7723536465483074, "adv/std_final_conf": 0.8569350838661194, "adv/std_reasoning": 0.7015075087547302, "adv/std_step_conf": 0.9364147782325745, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6329937938478143, "calib/avg_num_step_conf": 8.7421875, "calib/ece": 0.29515918367346944, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7020408163265306, "calib/gap": 0.2214092687533732, "calib/mean_conf": 0.7685469387755102, "calib/mu_c": 0.8670514705882354, "calib/mu_w": 0.6456422018348622, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25430204081632657, "calib/std_conf": 0.362472511048961, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8939222316145393, "calib/step_q_c_n": 1183.0, "calib/step_q_gap": 0.05395066763349665, "calib/step_q_w": 0.8399715639810427, "calib/step_q_w_n": 1055.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 729.98046875, "completions/mean_terminated_length": 753.5281982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.20373333333333332, "grad_norm": 0.02957489714026451, "kl": 0.1614990234375, "learning_rate": 2.5000000000000004e-07, "loss": -0.0753, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019972363486886024, "mask/share_reasoning": 0.8223469257354736, "mask/share_step_conf": 0.1264307200908661, "num_tokens": 57906018.0, "reward": 0.5725820660591125, "reward_std": 0.3514135479927063, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6559983491897583, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.1915094554424286, "step": 191 }, { "adv/mean_abs_final_conf": 0.6305051445960999, "adv/mean_abs_reasoning": 0.48335394263267517, "adv/mean_abs_step_conf": 0.7706685066223145, "adv/ratio_final_to_reasoning": 1.3044377814773558, "adv/ratio_step_to_reasoning": 1.5944185795293782, "adv/std_final_conf": 0.8294618129730225, "adv/std_reasoning": 0.7755149006843567, "adv/std_step_conf": 0.9359073042869568, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.78238244983528, "calib/avg_num_step_conf": 8.9296875, "calib/ece": 0.16715637860082297, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6954732510288066, "calib/gap": 0.40719047619047605, "calib/mean_conf": 0.7602427983539094, "calib/mu_c": 0.9009999999999999, "calib/mu_w": 0.49380952380952386, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1365390946502057, "calib/std_conf": 0.36608740256453653, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.894045718432511, "calib/step_q_c_n": 1378.0, "calib/step_q_gap": 0.08005893429154187, "calib/step_q_w": 0.8139867841409691, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 800.26953125, "completions/mean_terminated_length": 816.211181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.2048, "grad_norm": 0.04933116212487221, "kl": 0.1651611328125, "learning_rate": 2.2222222222222224e-07, "loss": -0.0314, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01982063613831997, "mask/share_reasoning": 0.8341763615608215, "mask/share_step_conf": 0.12647175788879395, "num_tokens": 58215863.0, "reward": 0.7209115028381348, "reward_std": 0.3499925136566162, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7715025544166565, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.35625797510147095, "step": 192 }, { "adv/mean_abs_final_conf": 0.7361017465591431, "adv/mean_abs_reasoning": 0.6458040475845337, "adv/mean_abs_step_conf": 0.7798423767089844, "adv/ratio_final_to_reasoning": 1.1398221322897326, "adv/ratio_step_to_reasoning": 1.2075526309037348, "adv/std_final_conf": 0.9041551947593689, "adv/std_reasoning": 0.8430927991867065, "adv/std_step_conf": 0.9367273449897766, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7159560723514211, "calib/avg_num_step_conf": 8.09765625, "calib/ece": 0.23799196787148594, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5261044176706827, "calib/gap": 0.31876162790697676, "calib/mean_conf": 0.6362248995983937, "calib/mu_c": 0.7898449612403101, "calib/mu_w": 0.47108333333333335, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1780722891566265, "calib/std_conf": 0.40659745644660067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8885907335907336, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.060432585085429924, "calib/step_q_w": 0.8281581485053037, "calib/step_q_w_n": 1037.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 782.04296875, "completions/mean_terminated_length": 797.6215209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.20586666666666667, "grad_norm": 0.03797115013003349, "kl": 0.1591644287109375, "learning_rate": 1.9444444444444447e-07, "loss": 0.0038, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018755977973341942, "mask/share_reasoning": 0.850509524345398, "mask/share_step_conf": 0.11120319366455078, "num_tokens": 58521778.0, "reward": 0.5878036022186279, "reward_std": 0.4244157671928406, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.706500768661499, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.1745751053094864, "step": 193 }, { "adv/mean_abs_final_conf": 0.5197247266769409, "adv/mean_abs_reasoning": 0.38382184505462646, "adv/mean_abs_step_conf": 0.7647292613983154, "adv/ratio_final_to_reasoning": 1.3540780270153003, "adv/ratio_step_to_reasoning": 1.9924068190789852, "adv/std_final_conf": 0.7646445631980896, "adv/std_reasoning": 0.6614271998405457, "adv/std_step_conf": 0.9360878467559814, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7661644029428409, "calib/avg_num_step_conf": 8.5546875, "calib/ece": 0.16677551020408168, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6530612244897959, "calib/gap": 0.42447439162422196, "calib/mean_conf": 0.718938775510204, "calib/mu_c": 0.8800657894736843, "calib/mu_w": 0.45559139784946234, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13265306122448983, "calib/std_conf": 0.3857615001877093, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8965129007036746, "calib/step_q_c_n": 1279.0, "calib/step_q_gap": 0.057533756905650546, "calib/step_q_w": 0.8389791437980241, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 730.33203125, "completions/mean_terminated_length": 753.89111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.20693333333333333, "grad_norm": 0.036260880529880524, "kl": 0.17108154296875, "learning_rate": 1.6666666666666668e-07, "loss": -0.0248, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019665461033582687, "mask/share_reasoning": 0.8306401968002319, "mask/share_step_conf": 0.11844430863857269, "num_tokens": 58814687.0, "reward": 0.6782735586166382, "reward_std": 0.28763946890830994, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7712780833244324, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.27511268854141235, "step": 194 }, { "adv/mean_abs_final_conf": 0.6894845962524414, "adv/mean_abs_reasoning": 0.49503880739212036, "adv/mean_abs_step_conf": 0.7993884086608887, "adv/ratio_final_to_reasoning": 1.3927889813016225, "adv/ratio_step_to_reasoning": 1.6147994798066263, "adv/std_final_conf": 0.8758496642112732, "adv/std_reasoning": 0.7577955722808838, "adv/std_step_conf": 0.9363441467285156, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6927521008403362, "calib/avg_num_step_conf": 8.203125, "calib/ece": 0.22973140495867767, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5909090909090909, "calib/gap": 0.29902380952380936, "calib/mean_conf": 0.6888223140495867, "calib/mu_c": 0.8148571428571428, "calib/mu_w": 0.5158333333333335, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17002066115702477, "calib/std_conf": 0.39343785336538334, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8926034482758621, "calib/step_q_c_n": 1160.0, "calib/step_q_gap": 0.024358767424798256, "calib/step_q_w": 0.8682446808510639, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 765.34375, "completions/mean_terminated_length": 790.0322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.208, "grad_norm": 0.03882647678256035, "kl": 0.1652374267578125, "learning_rate": 1.3888888888888888e-07, "loss": -0.1117, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018789615482091904, "mask/share_reasoning": 0.8310298919677734, "mask/share_step_conf": 0.11893048882484436, "num_tokens": 59116599.0, "reward": 0.5893170237541199, "reward_std": 0.33620980381965637, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.694831132888794, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.18536534905433655, "step": 195 }, { "adv/mean_abs_final_conf": 0.5781576633453369, "adv/mean_abs_reasoning": 0.42664217948913574, "adv/mean_abs_step_conf": 0.7782294750213623, "adv/ratio_final_to_reasoning": 1.355134797121154, "adv/ratio_step_to_reasoning": 1.8240800193577194, "adv/std_final_conf": 0.794323205947876, "adv/std_reasoning": 0.7014920711517334, "adv/std_step_conf": 0.9358806014060974, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6818520778252322, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.24919354838709687, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7459677419354839, "calib/gap": 0.3051901565995525, "calib/mean_conf": 0.7866935483870968, "calib/mu_c": 0.9085234899328859, "calib/mu_w": 0.6033333333333334, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21754032258064526, "calib/std_conf": 0.35887530805639195, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8958135731807031, "calib/step_q_c_n": 1223.0, "calib/step_q_gap": 0.027497661450157196, "calib/step_q_w": 0.8683159117305459, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 700.89453125, "completions/mean_terminated_length": 706.4133911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.20906666666666668, "grad_norm": 0.019181951880455017, "kl": 0.175933837890625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0149, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02079775743186474, "mask/share_reasoning": 0.8414372205734253, "mask/share_step_conf": 0.12995250523090363, "num_tokens": 59398572.0, "reward": 0.656751275062561, "reward_std": 0.3294701874256134, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7199835777282715, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2833626866340637, "step": 196 }, { "adv/mean_abs_final_conf": 0.7014039754867554, "adv/mean_abs_reasoning": 0.5561388731002808, "adv/mean_abs_step_conf": 0.7772125005722046, "adv/ratio_final_to_reasoning": 1.2612029286438335, "adv/ratio_step_to_reasoning": 1.3975151498393112, "adv/std_final_conf": 0.8776654601097107, "adv/std_reasoning": 0.7929669618606567, "adv/std_step_conf": 0.9363115429878235, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7232647296206618, "calib/avg_num_step_conf": 8.578125, "calib/ece": 0.2052049180327868, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5286885245901639, "calib/gap": 0.34411218724778025, "calib/mean_conf": 0.6516803278688524, "calib/mu_c": 0.818095238095238, "calib/mu_w": 0.47398305084745773, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17024590163934417, "calib/std_conf": 0.391695615480666, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8928584729981378, "calib/step_q_c_n": 1074.0, "calib/step_q_gap": 0.03971230544020554, "calib/step_q_w": 0.8531461675579323, "calib/step_q_w_n": 1122.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 747.97265625, "completions/mean_terminated_length": 781.5550537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.21013333333333334, "grad_norm": 0.021937143057584763, "kl": 0.1743927001953125, "learning_rate": 8.333333333333334e-08, "loss": -0.1275, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018995754420757294, "mask/share_reasoning": 0.8196362257003784, "mask/share_step_conf": 0.11839927732944489, "num_tokens": 59695109.0, "reward": 0.6292377710342407, "reward_std": 0.3564552962779999, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7152363061904907, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2541767358779907, "step": 197 }, { "adv/mean_abs_final_conf": 0.5963702201843262, "adv/mean_abs_reasoning": 0.4490445554256439, "adv/mean_abs_step_conf": 0.8115639090538025, "adv/ratio_final_to_reasoning": 1.3280869637068287, "adv/ratio_step_to_reasoning": 1.8073126580602472, "adv/std_final_conf": 0.8019877076148987, "adv/std_reasoning": 0.6818601489067078, "adv/std_step_conf": 0.9362412691116333, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7639022051773731, "calib/avg_num_step_conf": 8.68359375, "calib/ece": 0.17732388663967605, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.611336032388664, "calib/gap": 0.4143632379126146, "calib/mean_conf": 0.6918987854251011, "calib/mu_c": 0.8563020134228186, "calib/mu_w": 0.44193877551020405, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13299190283400802, "calib/std_conf": 0.3922373820082409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908143708116158, "calib/step_q_c_n": 1343.0, "calib/step_q_gap": 0.025518708116157973, "calib/step_q_w": 0.882625, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 741.4921875, "completions/mean_terminated_length": 750.2846069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.2112, "grad_norm": 0.0298744086176157, "kl": 0.18133544921875, "learning_rate": 5.555555555555556e-08, "loss": 0.0256, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020268086344003677, "mask/share_reasoning": 0.8358542919158936, "mask/share_step_conf": 0.13215884566307068, "num_tokens": 59990315.0, "reward": 0.7111806273460388, "reward_std": 0.30690059065818787, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7692664265632629, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3437197804450989, "step": 198 }, { "adv/mean_abs_final_conf": 0.7013013362884521, "adv/mean_abs_reasoning": 0.5438744425773621, "adv/mean_abs_step_conf": 0.7945132851600647, "adv/ratio_final_to_reasoning": 1.2894544795395442, "adv/ratio_step_to_reasoning": 1.4608395301587482, "adv/std_final_conf": 0.8762961626052856, "adv/std_reasoning": 0.7754784226417542, "adv/std_step_conf": 0.9364268779754639, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6499397227245329, "calib/avg_num_step_conf": 8.31640625, "calib/ece": 0.2786772727272726, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.640495867768595, "calib/gap": 0.21296984629294757, "calib/mean_conf": 0.7159500000000001, "calib/mu_c": 0.7898734177215191, "calib/mu_w": 0.5769035714285715, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17086735537190073, "calib/std_conf": 0.38994052632452547, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8802288329519451, "calib/step_q_c_n": 1311.0, "calib/step_q_gap": 0.01898188918666388, "calib/step_q_w": 0.8612469437652812, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 754.75, "completions/mean_terminated_length": 785.4308471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.21226666666666666, "grad_norm": 0.031152119860053062, "kl": 0.1701507568359375, "learning_rate": 2.777777777777778e-08, "loss": -0.0671, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01972189173102379, "mask/share_reasoning": 0.8208757638931274, "mask/share_step_conf": 0.12033981829881668, "num_tokens": 60287731.0, "reward": 0.6673556566238403, "reward_std": 0.3885536789894104, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6748343706130981, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.34737688302993774, "step": 199 }, { "adv/mean_abs_final_conf": 0.5053526163101196, "adv/mean_abs_reasoning": 0.37649911642074585, "adv/mean_abs_step_conf": 0.742881178855896, "adv/ratio_final_to_reasoning": 1.3422411747319407, "adv/ratio_step_to_reasoning": 1.9731286116106321, "adv/std_final_conf": 0.7446314692497253, "adv/std_reasoning": 0.6612992286682129, "adv/std_step_conf": 0.9360516667366028, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7355482933914307, "calib/avg_num_step_conf": 7.91015625, "calib/ece": 0.19168016194331977, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.37423565722585317, "calib/mean_conf": 0.7230971659919027, "calib/mu_c": 0.8518827160493827, "calib/mu_w": 0.47764705882352954, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1294534412955465, "calib/std_conf": 0.3892157362494419, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9031725697061039, "calib/step_q_c_n": 1327.0, "calib/step_q_gap": 0.04088030609578874, "calib/step_q_w": 0.8622922636103152, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 737.1328125, "completions/mean_terminated_length": 754.8240356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.21333333333333335, "grad_norm": 0.023639973253011703, "kl": 0.168212890625, "learning_rate": 0.0, "loss": -0.0846, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01986561343073845, "mask/share_reasoning": 0.8402686715126038, "mask/share_step_conf": 0.1164281889796257, "num_tokens": 60584485.0, "reward": 0.7310245037078857, "reward_std": 0.28140828013420105, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7594671249389648, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.3830506503582001, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.025999896985013038, "train_runtime": 20057.5097, "train_samples_per_second": 2.553, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 60584485, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }