{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.13436700403690338, "adv/mean_abs_reasoning": 0.15610459446907043, "adv/mean_abs_step_conf": 0.13763591647148132, "adv/ratio_final_to_reasoning": 0.8607498356720436, "adv/ratio_step_to_reasoning": 0.8816903624111565, "adv/std_final_conf": 0.40337812900543213, "adv/std_reasoning": 0.43819621205329895, "adv/std_step_conf": 0.4055573642253876, "calib/answer_extract_rate": 0.08203125, "calib/auroc": 0.6944444444444445, "calib/avg_num_step_conf": 0.3359375, "calib/ece": 0.6230769230769231, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.04296875, "calib/frac_conf_gt_0.9": 0.7692307692307693, "calib/gap": 0.03861111111111115, "calib/mean_conf": 0.9307692307692309, "calib/mu_c": 0.9575, "calib/mu_w": 0.9188888888888889, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.6230769230769231, "calib/std_conf": 0.07965903671384378, "calib/step_conf_rate": 0.0703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 613.67578125, "completions/mean_terminated_length": 674.2532348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0010666666666666667, "grad_norm": 0.187005877494812, "learning_rate": 2.5000000000000004e-07, "loss": -0.905, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0006368102040141821, "mask/share_reasoning": 0.9053931832313538, "mask/share_step_conf": 0.004126261919736862, "num_tokens": 264685.0, "reward": 0.05631820484995842, "reward_std": 0.11161534488201141, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.01655624993145466, "rewards/format_reward_step": 0.04296875, "rewards/stepwise_brier_reward": 0.030139535665512085, "step": 1 }, { "adv/mean_abs_final_conf": 0.24564749002456665, "adv/mean_abs_reasoning": 0.308665931224823, "adv/mean_abs_step_conf": 0.2689768671989441, "adv/ratio_final_to_reasoning": 0.7958360971352048, "adv/ratio_step_to_reasoning": 0.8714174127724819, "adv/std_final_conf": 0.5673569440841675, "adv/std_reasoning": 0.6196993589401245, "adv/std_step_conf": 0.5733745098114014, "calib/answer_extract_rate": 0.13671875, "calib/auroc": 0.5338345864661654, "calib/avg_num_step_conf": 0.55078125, "calib/ece": 0.6261538461538463, "calib/final_conf_rate": 0.1015625, "calib/format_rate": 0.08984375, "calib/frac_conf_gt_0.9": 0.7692307692307693, "calib/gap": 0.002406015037593856, "calib/mean_conf": 0.8953846153846153, "calib/mu_c": 0.897142857142857, "calib/mu_w": 0.8947368421052632, "calib/nonempty_final_conf_rate": 0.1015625, "calib/nonempty_reasoning_rate": 0.14453125, "calib/nonempty_step_conf_rate": 0.109375, "calib/pce": 0.6261538461538463, "calib/std_conf": 0.18653172073466937, "calib/step_conf_rate": 0.109375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 646.4609375, "completions/mean_terminated_length": 683.8594970703125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0021333333333333334, "grad_norm": 0.18600612878799438, "learning_rate": 5.000000000000001e-07, "loss": -1.4124, "mask/has_final_conf_rate": 0.1015625, "mask/share_final_conf": 0.003081148024648428, "mask/share_reasoning": 0.934806227684021, "mask/share_step_conf": 0.007425096817314625, "num_tokens": 533467.0, "reward": 0.10996345430612564, "reward_std": 0.20499171316623688, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.02965039201080799, "rewards/format_reward_step": 0.08984375, "rewards/stepwise_brier_reward": 0.05275794863700867, "step": 2 }, { "adv/mean_abs_final_conf": 0.11627759039402008, "adv/mean_abs_reasoning": 0.15320944786071777, "adv/mean_abs_step_conf": 0.14957186579704285, "adv/ratio_final_to_reasoning": 0.758945300160129, "adv/ratio_step_to_reasoning": 0.9762574559567512, "adv/std_final_conf": 0.397796094417572, "adv/std_reasoning": 0.4381342828273773, "adv/std_step_conf": 0.43789950013160706, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 0.26953125, "calib/ece": 0.974, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.974, "calib/mu_c": NaN, "calib/mu_w": 0.974, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.974, "calib/std_conf": 0.02537715508089904, "calib/step_conf_rate": 0.05078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 684.71875, "completions/mean_terminated_length": 762.1217041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.1434255689382553, "learning_rate": 7.5e-07, "loss": -0.7424, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.000625855871476233, "mask/share_reasoning": 0.8943076133728027, "mask/share_step_conf": 0.0035040113143622875, "num_tokens": 814011.0, "reward": 0.030487660318613052, "reward_std": 0.06891795992851257, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0019796874839812517, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.020922981202602386, "step": 3 }, { "adv/mean_abs_final_conf": 0.13574722409248352, "adv/mean_abs_reasoning": 0.1447548270225525, "adv/mean_abs_step_conf": 0.14417070150375366, "adv/ratio_final_to_reasoning": 0.9377733847268139, "adv/ratio_step_to_reasoning": 0.9959647251092509, "adv/std_final_conf": 0.4322591722011566, "adv/std_reasoning": 0.43817517161369324, "adv/std_step_conf": 0.43792372941970825, "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.40476190476190477, "calib/avg_num_step_conf": 0.2109375, "calib/ece": 0.64, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": -0.02857142857142858, "calib/mean_conf": 0.9399999999999998, "calib/mu_c": 0.9199999999999999, "calib/mu_w": 0.9485714285714285, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.64, "calib/std_conf": 0.05848076606885379, "calib/step_conf_rate": 0.046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 707.953125, "completions/mean_terminated_length": 774.5128784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.004266666666666667, "grad_norm": 0.18157421052455902, "learning_rate": 1.0000000000000002e-06, "loss": -0.9007, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0004933603922836483, "mask/share_reasoning": 0.9104673862457275, "mask/share_step_conf": 0.003101823152974248, "num_tokens": 1101415.0, "reward": 0.04077763482928276, "reward_std": 0.09365655481815338, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.010477343574166298, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.017722850665450096, "step": 4 }, { "adv/mean_abs_final_conf": 0.19167664647102356, "adv/mean_abs_reasoning": 0.2151871919631958, "adv/mean_abs_step_conf": 0.19849944114685059, "adv/ratio_final_to_reasoning": 0.8907437506959367, "adv/ratio_step_to_reasoning": 0.9224500739839601, "adv/std_final_conf": 0.519432544708252, "adv/std_reasoning": 0.5492884516716003, "adv/std_step_conf": 0.5234185457229614, "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.5104166666666666, "calib/avg_num_step_conf": 0.35546875, "calib/ece": 0.6356250000000001, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.04296875, "calib/frac_conf_gt_0.9": 0.6875, "calib/gap": 0.07916666666666672, "calib/mean_conf": 0.885625, "calib/mu_c": 0.945, "calib/mu_w": 0.8658333333333332, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.6356250000000001, "calib/std_conf": 0.23192048071483465, "calib/step_conf_rate": 0.0703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 655.25390625, "completions/mean_terminated_length": 732.5109252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.005333333333333333, "grad_norm": 0.1356576830148697, "learning_rate": 1.25e-06, "loss": -1.0562, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.0008796079782769084, "mask/share_reasoning": 0.8888272643089294, "mask/share_step_conf": 0.004824398085474968, "num_tokens": 1375848.0, "reward": 0.059928152710199356, "reward_std": 0.16126090288162231, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.018272656947374344, "rewards/format_reward_step": 0.04296875, "rewards/stepwise_brier_reward": 0.02868872694671154, "step": 5 }, { "adv/mean_abs_final_conf": 0.1842029094696045, "adv/mean_abs_reasoning": 0.20916666090488434, "adv/mean_abs_step_conf": 0.20033016800880432, "adv/ratio_final_to_reasoning": 0.880651384272794, "adv/ratio_step_to_reasoning": 0.9577538176597928, "adv/std_final_conf": 0.49309083819389343, "adv/std_reasoning": 0.49686720967292786, "adv/std_step_conf": 0.4966629445552826, "calib/answer_extract_rate": 0.10546875, "calib/auroc": 0.4736842105263158, "calib/avg_num_step_conf": 0.49609375, "calib/ece": 0.7418363636363637, "calib/final_conf_rate": 0.0859375, "calib/format_rate": 0.05859375, "calib/frac_conf_gt_0.9": 0.7272727272727273, "calib/gap": 0.04454035087719299, "calib/mean_conf": 0.8782, "calib/mu_c": 0.9166666666666666, "calib/mu_w": 0.8721263157894736, "calib/nonempty_final_conf_rate": 0.0859375, "calib/nonempty_reasoning_rate": 0.13671875, "calib/nonempty_step_conf_rate": 0.09375, "calib/pce": 0.7418363636363637, "calib/std_conf": 0.23882599980282337, "calib/step_conf_rate": 0.09375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 534.37890625, "completions/mean_terminated_length": 577.2193603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.1741025298833847, "learning_rate": 1.5e-06, "loss": -0.9792, "mask/has_final_conf_rate": 0.0859375, "mask/share_final_conf": 0.0018007527105510235, "mask/share_reasoning": 0.9179291725158691, "mask/share_step_conf": 0.006051314529031515, "num_tokens": 1618601.0, "reward": 0.06880377233028412, "reward_std": 0.1430698037147522, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.021675746887922287, "rewards/format_reward_step": 0.05859375, "rewards/stepwise_brier_reward": 0.03692592680454254, "step": 6 }, { "adv/mean_abs_final_conf": 0.10781514644622803, "adv/mean_abs_reasoning": 0.1530671864748001, "adv/mean_abs_step_conf": 0.11152852326631546, "adv/ratio_final_to_reasoning": 0.7043648539524043, "adv/ratio_step_to_reasoning": 0.7286246375520642, "adv/std_final_conf": 0.36762505769729614, "adv/std_reasoning": 0.43820399045944214, "adv/std_step_conf": 0.37017136812210083, "calib/answer_extract_rate": 0.1015625, "calib/auroc": 0.52, "calib/avg_num_step_conf": 0.28125, "calib/ece": 0.6213333333333335, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.8666666666666667, "calib/gap": 0.008000000000000007, "calib/mean_conf": 0.9346666666666666, "calib/mu_c": 0.9400000000000001, "calib/mu_w": 0.932, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.12109375, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.6113333333333335, "calib/std_conf": 0.07499925925560123, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 626.88671875, "completions/mean_terminated_length": 697.7521362304688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.007466666666666667, "grad_norm": 0.0667913407087326, "learning_rate": 1.75e-06, "loss": -0.5539, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.001980168977752328, "mask/share_reasoning": 0.893886387348175, "mask/share_step_conf": 0.0025709576439112425, "num_tokens": 1886508.0, "reward": 0.05245732516050339, "reward_std": 0.10757142305374146, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.013073046691715717, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.02025311440229416, "step": 7 }, { "adv/mean_abs_final_conf": 0.20839053392410278, "adv/mean_abs_reasoning": 0.21437883377075195, "adv/mean_abs_step_conf": 0.21140068769454956, "adv/ratio_final_to_reasoning": 0.9720667393262675, "adv/ratio_step_to_reasoning": 0.9861080218423658, "adv/std_final_conf": 0.521725058555603, "adv/std_reasoning": 0.5237656831741333, "adv/std_step_conf": 0.5234709978103638, "calib/answer_extract_rate": 0.09375, "calib/auroc": 0.45714285714285713, "calib/avg_num_step_conf": 0.43359375, "calib/ece": 0.36470588235294127, "calib/final_conf_rate": 0.06640625, "calib/format_rate": 0.05859375, "calib/frac_conf_gt_0.9": 0.8235294117647058, "calib/gap": 0.0345714285714287, "calib/mean_conf": 0.9317647058823529, "calib/mu_c": 0.9460000000000001, "calib/mu_w": 0.9114285714285714, "calib/nonempty_final_conf_rate": 0.06640625, "calib/nonempty_reasoning_rate": 0.12890625, "calib/nonempty_step_conf_rate": 0.1015625, "calib/pce": 0.35411764705882365, "calib/std_conf": 0.08806650522508859, "calib/step_conf_rate": 0.1015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 2939.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 635.58984375, "completions/mean_terminated_length": 716.7885131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 6.0, "epoch": 0.008533333333333334, "grad_norm": 0.1552148461341858, "learning_rate": 2.0000000000000003e-06, "loss": -1.2229, "mask/has_final_conf_rate": 0.06640625, "mask/share_final_conf": 0.0010213814675807953, "mask/share_reasoning": 0.8800764083862305, "mask/share_step_conf": 0.00562096806243062, "num_tokens": 2155731.0, "reward": 0.10557056963443756, "reward_std": 0.2017257809638977, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.04262109100818634, "rewards/format_reward_step": 0.05859375, "rewards/stepwise_brier_reward": 0.04529935121536255, "step": 8 }, { "adv/mean_abs_final_conf": 0.1774379312992096, "adv/mean_abs_reasoning": 0.20591090619564056, "adv/mean_abs_step_conf": 0.18646956980228424, "adv/ratio_final_to_reasoning": 0.86172187077173, "adv/ratio_step_to_reasoning": 0.9055837461329772, "adv/std_final_conf": 0.4876402020454407, "adv/std_reasoning": 0.5237042903900146, "adv/std_step_conf": 0.49650344252586365, "calib/answer_extract_rate": 0.08203125, "calib/auroc": 0.24999999999999997, "calib/avg_num_step_conf": 0.26953125, "calib/ece": 0.7626666666666667, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": -0.1742307692307693, "calib/mean_conf": 0.896, "calib/mu_c": 0.745, "calib/mu_w": 0.9192307692307693, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.7626666666666667, "calib/std_conf": 0.14627827362029308, "calib/step_conf_rate": 0.06640625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 669.16796875, "completions/mean_terminated_length": 732.0812377929688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0096, "grad_norm": 0.16828308999538422, "learning_rate": 2.25e-06, "loss": -1.0588, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.0011211318196728826, "mask/share_reasoning": 0.909247875213623, "mask/share_step_conf": 0.0036935298703610897, "num_tokens": 2434574.0, "reward": 0.0528482049703598, "reward_std": 0.1220763698220253, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.010931640863418579, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.026011832058429718, "step": 9 }, { "adv/mean_abs_final_conf": 0.13913732767105103, "adv/mean_abs_reasoning": 0.16954657435417175, "adv/mean_abs_step_conf": 0.14402230083942413, "adv/ratio_final_to_reasoning": 0.820643697468061, "adv/ratio_step_to_reasoning": 0.8494556813549705, "adv/std_final_conf": 0.43238168954849243, "adv/std_reasoning": 0.4683932960033417, "adv/std_step_conf": 0.4379076659679413, "calib/answer_extract_rate": 0.08984375, "calib/auroc": 0.1428571428571429, "calib/avg_num_step_conf": 0.3984375, "calib/ece": 0.8740000000000001, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": -0.04785714285714271, "calib/mean_conf": 0.9246666666666667, "calib/mu_c": 0.88, "calib/mu_w": 0.9278571428571427, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.11328125, "calib/nonempty_step_conf_rate": 0.07421875, "calib/pce": 0.8660000000000001, "calib/std_conf": 0.09258269576738169, "calib/step_conf_rate": 0.07421875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 595.4765625, "completions/mean_terminated_length": 645.940673828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.010666666666666666, "grad_norm": 0.1307852417230606, "learning_rate": 2.5e-06, "loss": -0.9333, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.001026952755637467, "mask/share_reasoning": 0.9161863923072815, "mask/share_step_conf": 0.0046616545878350735, "num_tokens": 2693816.0, "reward": 0.034988999366760254, "reward_std": 0.08137714862823486, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.004281249828636646, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.020962368696928024, "step": 10 }, { "adv/mean_abs_final_conf": 0.23236548900604248, "adv/mean_abs_reasoning": 0.24977372586727142, "adv/mean_abs_step_conf": 0.2524455487728119, "adv/ratio_final_to_reasoning": 0.9303039709209463, "adv/ratio_step_to_reasoning": 1.0106969734156916, "adv/std_final_conf": 0.5680802464485168, "adv/std_reasoning": 0.5737043023109436, "adv/std_step_conf": 0.5733542442321777, "calib/answer_extract_rate": 0.1171875, "calib/auroc": 0.48214285714285715, "calib/avg_num_step_conf": 0.48828125, "calib/ece": 0.7459999999999999, "calib/final_conf_rate": 0.09765625, "calib/format_rate": 0.0703125, "calib/frac_conf_gt_0.9": 0.72, "calib/gap": -0.0635714285714285, "calib/mean_conf": 0.8483999999999999, "calib/mu_c": 0.7949999999999999, "calib/mu_w": 0.8585714285714284, "calib/nonempty_final_conf_rate": 0.09765625, "calib/nonempty_reasoning_rate": 0.140625, "calib/nonempty_step_conf_rate": 0.109375, "calib/pce": 0.7172, "calib/std_conf": 0.2857646584166768, "calib/step_conf_rate": 0.109375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 635.75390625, "completions/mean_terminated_length": 698.5107421875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.011733333333333333, "grad_norm": 0.22519375383853912, "learning_rate": 2.7500000000000004e-06, "loss": -1.1239, "mask/has_final_conf_rate": 0.09765625, "mask/share_final_conf": 0.0028018150478601456, "mask/share_reasoning": 0.9016100168228149, "mask/share_step_conf": 0.005744474474340677, "num_tokens": 2961049.0, "reward": 0.07517063617706299, "reward_std": 0.16879817843437195, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.019722266122698784, "rewards/format_reward_step": 0.0703125, "rewards/stepwise_brier_reward": 0.03891763836145401, "step": 11 }, { "adv/mean_abs_final_conf": 0.20684903860092163, "adv/mean_abs_reasoning": 0.22811613976955414, "adv/mean_abs_step_conf": 0.23255550861358643, "adv/ratio_final_to_reasoning": 0.906770730075843, "adv/ratio_step_to_reasoning": 1.0194610028405575, "adv/std_final_conf": 0.517840564250946, "adv/std_reasoning": 0.549264132976532, "adv/std_step_conf": 0.5490169525146484, "calib/answer_extract_rate": 0.1171875, "calib/auroc": 0.5750000000000001, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.6822222222222221, "calib/final_conf_rate": 0.09375, "calib/format_rate": 0.0625, "calib/frac_conf_gt_0.9": 0.7083333333333334, "calib/gap": 0.07033333333333336, "calib/mean_conf": 0.8163888888888889, "calib/mu_c": 0.875, "calib/mu_w": 0.8046666666666666, "calib/nonempty_final_conf_rate": 0.09375, "calib/nonempty_reasoning_rate": 0.12890625, "calib/nonempty_step_conf_rate": 0.0859375, "calib/pce": 0.6659722222222221, "calib/std_conf": 0.2886413202027649, "calib/step_conf_rate": 0.0859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 692.40234375, "completions/mean_terminated_length": 751.0805053710938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.1815670132637024, "learning_rate": 3e-06, "loss": -1.1994, "mask/has_final_conf_rate": 0.09375, "mask/share_final_conf": 0.0017455201596021652, "mask/share_reasoning": 0.9151567220687866, "mask/share_step_conf": 0.00497277919203043, "num_tokens": 3242480.0, "reward": 0.07485318183898926, "reward_std": 0.16080008447170258, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.024570703506469727, "rewards/format_reward_step": 0.0625, "rewards/stepwise_brier_reward": 0.043671008199453354, "step": 12 }, { "adv/mean_abs_final_conf": 0.1520506739616394, "adv/mean_abs_reasoning": 0.1872754991054535, "adv/mean_abs_step_conf": 0.1856890469789505, "adv/ratio_final_to_reasoning": 0.8119090574470756, "adv/ratio_step_to_reasoning": 0.9915287790763827, "adv/std_final_conf": 0.46135035157203674, "adv/std_reasoning": 0.4968181252479553, "adv/std_step_conf": 0.49658989906311035, "calib/answer_extract_rate": 0.1015625, "calib/auroc": 0.5769230769230769, "calib/avg_num_step_conf": 0.38671875, "calib/ece": 0.7679999999999998, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.0546875, "calib/frac_conf_gt_0.9": 0.9333333333333333, "calib/gap": 0.07923076923076933, "calib/mean_conf": 0.9013333333333333, "calib/mu_c": 0.97, "calib/mu_w": 0.8907692307692306, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.12890625, "calib/nonempty_step_conf_rate": 0.0859375, "calib/pce": 0.7679999999999998, "calib/std_conf": 0.23391926432472854, "calib/step_conf_rate": 0.0859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 635.34765625, "completions/mean_terminated_length": 701.0733032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.013866666666666666, "grad_norm": 0.15629175305366516, "learning_rate": 3.2500000000000002e-06, "loss": -0.8651, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.0008117250981740654, "mask/share_reasoning": 0.9016801714897156, "mask/share_step_conf": 0.003758120583370328, "num_tokens": 3509721.0, "reward": 0.055212050676345825, "reward_std": 0.11932926625013351, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.014823437668383121, "rewards/format_reward_step": 0.0546875, "rewards/stepwise_brier_reward": 0.03269988298416138, "step": 13 }, { "adv/mean_abs_final_conf": 0.2434786558151245, "adv/mean_abs_reasoning": 0.2808641195297241, "adv/mean_abs_step_conf": 0.2866297662258148, "adv/ratio_final_to_reasoning": 0.8668912790384281, "adv/ratio_step_to_reasoning": 1.0205282422893485, "adv/std_final_conf": 0.567785382270813, "adv/std_reasoning": 0.5971327424049377, "adv/std_step_conf": 0.5968067646026611, "calib/answer_extract_rate": 0.11328125, "calib/auroc": 0.27976190476190477, "calib/avg_num_step_conf": 0.453125, "calib/ece": 0.7948, "calib/final_conf_rate": 0.09765625, "calib/format_rate": 0.08203125, "calib/frac_conf_gt_0.9": 0.88, "calib/gap": -0.04738095238095241, "calib/mean_conf": 0.9548000000000001, "calib/mu_c": 0.915, "calib/mu_w": 0.9623809523809524, "calib/nonempty_final_conf_rate": 0.09765625, "calib/nonempty_reasoning_rate": 0.140625, "calib/nonempty_step_conf_rate": 0.109375, "calib/pce": 0.7948, "calib/std_conf": 0.04622726468221972, "calib/step_conf_rate": 0.109375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 567.47265625, "completions/mean_terminated_length": 639.9691162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.014933333333333333, "grad_norm": 0.13358238339424133, "learning_rate": 3.5e-06, "loss": -1.1973, "mask/has_final_conf_rate": 0.09765625, "mask/share_final_conf": 0.0016311781946569681, "mask/share_reasoning": 0.8782390356063843, "mask/share_step_conf": 0.0068485308438539505, "num_tokens": 3760394.0, "reward": 0.08635027706623077, "reward_std": 0.18285219371318817, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0193667970597744, "rewards/format_reward_step": 0.08203125, "rewards/stepwise_brier_reward": 0.0497359000146389, "step": 14 }, { "adv/mean_abs_final_conf": 0.1690009981393814, "adv/mean_abs_reasoning": 0.21810534596443176, "adv/mean_abs_step_conf": 0.17306694388389587, "adv/ratio_final_to_reasoning": 0.7748594945808518, "adv/ratio_step_to_reasoning": 0.7935016132622413, "adv/std_final_conf": 0.4642001986503601, "adv/std_reasoning": 0.5237811803817749, "adv/std_step_conf": 0.4681417942047119, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.46875, "calib/avg_num_step_conf": 0.33984375, "calib/ece": 0.39374999999999993, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.5625, "calib/gap": -0.0050000000000000044, "calib/mean_conf": 0.8724999999999999, "calib/mu_c": 0.87, "calib/mu_w": 0.875, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.38312499999999994, "calib/std_conf": 0.12695963925594622, "calib/step_conf_rate": 0.0703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 645.37109375, "completions/mean_terminated_length": 718.3260498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.016, "grad_norm": 0.21051011979579926, "learning_rate": 3.7500000000000005e-06, "loss": -1.0117, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.002414593007415533, "mask/share_reasoning": 0.8917868137359619, "mask/share_step_conf": 0.004236122127622366, "num_tokens": 4033489.0, "reward": 0.07319016009569168, "reward_std": 0.1653515249490738, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.02371484413743019, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.025147901847958565, "step": 15 }, { "adv/mean_abs_final_conf": 0.140500009059906, "adv/mean_abs_reasoning": 0.15322402119636536, "adv/mean_abs_step_conf": 0.15020135045051575, "adv/ratio_final_to_reasoning": 0.9169581111557384, "adv/ratio_step_to_reasoning": 0.9802728663413951, "adv/std_final_conf": 0.4350675642490387, "adv/std_reasoning": 0.43818145990371704, "adv/std_step_conf": 0.4379708468914032, "calib/answer_extract_rate": 0.0546875, "calib/auroc": 0.888888888888889, "calib/avg_num_step_conf": 0.2109375, "calib/ece": 0.7154545454545455, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.10111111111111104, "calib/mean_conf": 0.897272727272727, "calib/mu_c": 0.98, "calib/mu_w": 0.8788888888888889, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.7154545454545455, "calib/std_conf": 0.1878147495946185, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 704.78515625, "completions/mean_terminated_length": 801.888916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.017066666666666667, "grad_norm": 0.16761568188667297, "learning_rate": 4.000000000000001e-06, "loss": -0.9862, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.001200198195874691, "mask/share_reasoning": 0.8753113150596619, "mask/share_step_conf": 0.0023947488516569138, "num_tokens": 4322762.0, "reward": 0.04348303750157356, "reward_std": 0.1033376008272171, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.013904296793043613, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.025326423346996307, "step": 16 }, { "adv/mean_abs_final_conf": 0.16247119009494781, "adv/mean_abs_reasoning": 0.204107403755188, "adv/mean_abs_step_conf": 0.171944722533226, "adv/ratio_final_to_reasoning": 0.7960083128087808, "adv/ratio_step_to_reasoning": 0.8424227606141188, "adv/std_final_conf": 0.4362327754497528, "adv/std_reasoning": 0.46846750378608704, "adv/std_step_conf": 0.43806710839271545, "calib/answer_extract_rate": 0.1171875, "calib/auroc": 0.5491071428571428, "calib/avg_num_step_conf": 0.5625, "calib/ece": 0.6430434782608694, "calib/final_conf_rate": 0.08984375, "calib/format_rate": 0.07421875, "calib/frac_conf_gt_0.9": 0.8260869565217391, "calib/gap": -0.002410714285714266, "calib/mean_conf": 0.947391304347826, "calib/mu_c": 0.9457142857142856, "calib/mu_w": 0.9481249999999999, "calib/nonempty_final_conf_rate": 0.08984375, "calib/nonempty_reasoning_rate": 0.14453125, "calib/nonempty_step_conf_rate": 0.10546875, "calib/pce": 0.6430434782608694, "calib/std_conf": 0.05317872991798486, "calib/step_conf_rate": 0.10546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 631.60546875, "completions/mean_terminated_length": 682.240478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.018133333333333335, "grad_norm": 0.1345282644033432, "learning_rate": 4.25e-06, "loss": -1.0562, "mask/has_final_conf_rate": 0.08984375, "mask/share_final_conf": 0.0014626241754740477, "mask/share_reasoning": 0.9170229434967041, "mask/share_step_conf": 0.00729574216529727, "num_tokens": 4587981.0, "reward": 0.10108289122581482, "reward_std": 0.13088199496269226, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.029129687696695328, "rewards/format_reward_step": 0.07421875, "rewards/stepwise_brier_reward": 0.05088218301534653, "step": 17 }, { "adv/mean_abs_final_conf": 0.14412738382816315, "adv/mean_abs_reasoning": 0.14765673875808716, "adv/mean_abs_step_conf": 0.14979150891304016, "adv/ratio_final_to_reasoning": 0.9760975695412973, "adv/ratio_step_to_reasoning": 1.0144576547803246, "adv/std_final_conf": 0.43377164006233215, "adv/std_reasoning": 0.43817782402038574, "adv/std_step_conf": 0.4377943277359009, "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.22916666666666666, "calib/avg_num_step_conf": 0.2109375, "calib/ece": 0.6718181818181819, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.01000000000000012, "calib/mean_conf": 0.922727272727273, "calib/mu_c": 0.93, "calib/mu_w": 0.9199999999999999, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.660909090909091, "calib/std_conf": 0.10514845860547571, "calib/step_conf_rate": 0.05078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 772.25390625, "completions/mean_terminated_length": 837.6991577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0192, "grad_norm": 0.13379941880702972, "learning_rate": 4.5e-06, "loss": -0.9989, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0005710519617423415, "mask/share_reasoning": 0.918830394744873, "mask/share_step_conf": 0.002473499160259962, "num_tokens": 4896398.0, "reward": 0.049791865050792694, "reward_std": 0.1002160832285881, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.015680858865380287, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.021430809050798416, "step": 18 }, { "adv/mean_abs_final_conf": 0.17932841181755066, "adv/mean_abs_reasoning": 0.19484557211399078, "adv/mean_abs_step_conf": 0.19124746322631836, "adv/ratio_final_to_reasoning": 0.9203617504463376, "adv/ratio_step_to_reasoning": 0.9815335352575145, "adv/std_final_conf": 0.4871034622192383, "adv/std_reasoning": 0.4967937469482422, "adv/std_step_conf": 0.4966728985309601, "calib/answer_extract_rate": 0.0859375, "calib/avg_num_step_conf": 0.28125, "calib/ece": 0.8526666666666669, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.8, "calib/mean_conf": 0.8526666666666667, "calib/mu_c": NaN, "calib/mu_w": 0.8526666666666667, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.8526666666666669, "calib/std_conf": 0.26541707221319094, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 555.2578125, "completions/mean_terminated_length": 646.1181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.020266666666666665, "grad_norm": 0.18200555443763733, "learning_rate": 4.75e-06, "loss": -1.137, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.0014445590786635876, "mask/share_reasoning": 0.8540744781494141, "mask/share_step_conf": 0.0038559352979063988, "num_tokens": 5143304.0, "reward": 0.0447770394384861, "reward_std": 0.09865927696228027, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.01148046925663948, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.03303259238600731, "step": 19 }, { "adv/mean_abs_final_conf": 0.28337258100509644, "adv/mean_abs_reasoning": 0.3037603497505188, "adv/mean_abs_step_conf": 0.306016206741333, "adv/ratio_final_to_reasoning": 0.932882060604134, "adv/ratio_step_to_reasoning": 1.0074264366388404, "adv/std_final_conf": 0.6145702004432678, "adv/std_reasoning": 0.6196979284286499, "adv/std_step_conf": 0.6194650530815125, "calib/answer_extract_rate": 0.14453125, "calib/auroc": 0.5138888888888888, "calib/avg_num_step_conf": 0.59375, "calib/ece": 0.6146153846153846, "calib/final_conf_rate": 0.1015625, "calib/format_rate": 0.0859375, "calib/frac_conf_gt_0.9": 0.8076923076923077, "calib/gap": 0.02736111111111106, "calib/mean_conf": 0.9223076923076923, "calib/mu_c": 0.9412499999999999, "calib/mu_w": 0.9138888888888889, "calib/nonempty_final_conf_rate": 0.1015625, "calib/nonempty_reasoning_rate": 0.16796875, "calib/nonempty_step_conf_rate": 0.1171875, "calib/pce": 0.6146153846153846, "calib/std_conf": 0.1132494758795035, "calib/step_conf_rate": 0.1171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 666.19140625, "completions/mean_terminated_length": 728.8248291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.021333333333333333, "grad_norm": 0.23702050745487213, "learning_rate": 5e-06, "loss": -1.8863, "mask/has_final_conf_rate": 0.1015625, "mask/share_final_conf": 0.0014831081498414278, "mask/share_reasoning": 0.9051233530044556, "mask/share_step_conf": 0.00745608052238822, "num_tokens": 5418721.0, "reward": 0.11435748636722565, "reward_std": 0.2429792582988739, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.04015117138624191, "rewards/format_reward_step": 0.0859375, "rewards/stepwise_brier_reward": 0.060201872140169144, "step": 20 }, { "adv/mean_abs_final_conf": 0.2655344605445862, "adv/mean_abs_reasoning": 0.27671802043914795, "adv/mean_abs_step_conf": 0.27363383769989014, "adv/ratio_final_to_reasoning": 0.9595849960302058, "adv/ratio_step_to_reasoning": 0.9888544203432676, "adv/std_final_conf": 0.5929623246192932, "adv/std_reasoning": 0.5971682071685791, "adv/std_step_conf": 0.5968631505966187, "calib/answer_extract_rate": 0.12109375, "calib/auroc": 0.7232142857142857, "calib/avg_num_step_conf": 0.4375, "calib/ece": 0.6221739130434784, "calib/final_conf_rate": 0.08984375, "calib/format_rate": 0.078125, "calib/frac_conf_gt_0.9": 0.6956521739130435, "calib/gap": 0.031696428571428736, "calib/mean_conf": 0.9265217391304347, "calib/mu_c": 0.9485714285714286, "calib/mu_w": 0.9168749999999999, "calib/nonempty_final_conf_rate": 0.08984375, "calib/nonempty_reasoning_rate": 0.14453125, "calib/nonempty_step_conf_rate": 0.109375, "calib/pce": 0.6221739130434784, "calib/std_conf": 0.06210850922952952, "calib/step_conf_rate": 0.109375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 637.85546875, "completions/mean_terminated_length": 686.0966796875, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0224, "grad_norm": 0.16225571930408478, "learning_rate": 4.9722222222222224e-06, "loss": -1.432, "mask/has_final_conf_rate": 0.08984375, "mask/share_final_conf": 0.0013876496814191341, "mask/share_reasoning": 0.9224339723587036, "mask/share_step_conf": 0.005865876562893391, "num_tokens": 5684972.0, "reward": 0.10086153447628021, "reward_std": 0.22441299259662628, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.03598632663488388, "rewards/format_reward_step": 0.078125, "rewards/stepwise_brier_reward": 0.05091741681098938, "step": 21 }, { "adv/mean_abs_final_conf": 0.21784275770187378, "adv/mean_abs_reasoning": 0.23352275788784027, "adv/mean_abs_step_conf": 0.23176658153533936, "adv/ratio_final_to_reasoning": 0.9328545092230475, "adv/ratio_step_to_reasoning": 0.992479635096874, "adv/std_final_conf": 0.5453119874000549, "adv/std_reasoning": 0.549296498298645, "adv/std_step_conf": 0.5490684509277344, "calib/answer_extract_rate": 0.10546875, "calib/auroc": 0.6166666666666667, "calib/avg_num_step_conf": 0.57421875, "calib/ece": 0.6299999999999999, "calib/final_conf_rate": 0.07421875, "calib/format_rate": 0.05859375, "calib/frac_conf_gt_0.9": 0.631578947368421, "calib/gap": 0.13549999999999995, "calib/mean_conf": 0.8405263157894737, "calib/mu_c": 0.9474999999999999, "calib/mu_w": 0.8119999999999999, "calib/nonempty_final_conf_rate": 0.07421875, "calib/nonempty_reasoning_rate": 0.140625, "calib/nonempty_step_conf_rate": 0.1015625, "calib/pce": 0.6299999999999999, "calib/std_conf": 0.2554144966460457, "calib/step_conf_rate": 0.1015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 626.9453125, "completions/mean_terminated_length": 682.97021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.023466666666666667, "grad_norm": 0.24337080121040344, "learning_rate": 4.944444444444445e-06, "loss": -1.2015, "mask/has_final_conf_rate": 0.07421875, "mask/share_final_conf": 0.0015047264751046896, "mask/share_reasoning": 0.9101567268371582, "mask/share_step_conf": 0.0063072992488741875, "num_tokens": 5947286.0, "reward": 0.07395316660404205, "reward_std": 0.18006539344787598, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.029487499967217445, "rewards/format_reward_step": 0.05859375, "rewards/stepwise_brier_reward": 0.043318841606378555, "step": 22 }, { "adv/mean_abs_final_conf": 0.17735733091831207, "adv/mean_abs_reasoning": 0.18752095103263855, "adv/mean_abs_step_conf": 0.1866583526134491, "adv/ratio_final_to_reasoning": 0.9458000822928982, "adv/ratio_step_to_reasoning": 0.9953999890975419, "adv/std_final_conf": 0.4912124276161194, "adv/std_reasoning": 0.4968550503253937, "adv/std_step_conf": 0.4964953660964966, "calib/answer_extract_rate": 0.09375, "calib/auroc": 0.4375, "calib/avg_num_step_conf": 0.3046875, "calib/ece": 0.7299999999999999, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.8888888888888888, "calib/gap": -0.0028571428571428914, "calib/mean_conf": 0.9522222222222223, "calib/mu_c": 0.95, "calib/mu_w": 0.9528571428571428, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.10546875, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.7299999999999999, "calib/std_conf": 0.04429140317332165, "calib/step_conf_rate": 0.0703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 617.796875, "completions/mean_terminated_length": 675.88037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.024533333333333334, "grad_norm": 0.13091301918029785, "learning_rate": 4.9166666666666665e-06, "loss": -1.0606, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.0017373452428728342, "mask/share_reasoning": 0.9078471660614014, "mask/share_step_conf": 0.004478019662201405, "num_tokens": 6209378.0, "reward": 0.05653750151395798, "reward_std": 0.12422418594360352, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.014636717736721039, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.023725392296910286, "step": 23 }, { "adv/mean_abs_final_conf": 0.19051101803779602, "adv/mean_abs_reasoning": 0.20617303252220154, "adv/mean_abs_step_conf": 0.20002737641334534, "adv/ratio_final_to_reasoning": 0.9240346116424369, "adv/ratio_step_to_reasoning": 0.9701917557612952, "adv/std_final_conf": 0.4938734769821167, "adv/std_reasoning": 0.49688783288002014, "adv/std_step_conf": 0.496569961309433, "calib/answer_extract_rate": 0.12890625, "calib/auroc": 0.5164835164835164, "calib/avg_num_step_conf": 0.71484375, "calib/ece": 0.5834999999999997, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.07421875, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.03417582417582432, "calib/mean_conf": 0.9334999999999999, "calib/mu_c": 0.9557142857142857, "calib/mu_w": 0.9215384615384614, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.1640625, "calib/nonempty_step_conf_rate": 0.125, "calib/pce": 0.5834999999999997, "calib/std_conf": 0.08193137372215849, "calib/step_conf_rate": 0.125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 726.30859375, "completions/mean_terminated_length": 791.2127685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0256, "grad_norm": 0.13573439419269562, "learning_rate": 4.888888888888889e-06, "loss": -0.9367, "mask/has_final_conf_rate": 0.078125, "mask/share_final_conf": 0.0010043885558843613, "mask/share_reasoning": 0.9085052609443665, "mask/share_step_conf": 0.008459066040813923, "num_tokens": 6499825.0, "reward": 0.09818961471319199, "reward_std": 0.15720906853675842, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.034287892282009125, "rewards/format_reward_step": 0.07421875, "rewards/stepwise_brier_reward": 0.04251652956008911, "step": 24 }, { "adv/mean_abs_final_conf": 0.2048564851284027, "adv/mean_abs_reasoning": 0.2124483287334442, "adv/mean_abs_step_conf": 0.21995320916175842, "adv/ratio_final_to_reasoning": 0.9642649878664525, "adv/ratio_step_to_reasoning": 1.0353256741206491, "adv/std_final_conf": 0.5154340863227844, "adv/std_reasoning": 0.5237152576446533, "adv/std_step_conf": 0.523485541343689, "calib/answer_extract_rate": 0.10546875, "calib/auroc": 0.661764705882353, "calib/avg_num_step_conf": 0.578125, "calib/ece": 0.7214285714285712, "calib/final_conf_rate": 0.08203125, "calib/format_rate": 0.0703125, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.06867647058823534, "calib/mean_conf": 0.9119047619047619, "calib/mu_c": 0.9675, "calib/mu_w": 0.8988235294117647, "calib/nonempty_final_conf_rate": 0.08203125, "calib/nonempty_reasoning_rate": 0.14453125, "calib/nonempty_step_conf_rate": 0.12109375, "calib/pce": 0.7214285714285712, "calib/std_conf": 0.1858985654705031, "calib/step_conf_rate": 0.12109375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 589.234375, "completions/mean_terminated_length": 641.8893432617188, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.02666666666666667, "grad_norm": 0.14066976308822632, "learning_rate": 4.861111111111111e-06, "loss": -1.2565, "mask/has_final_conf_rate": 0.08203125, "mask/share_final_conf": 0.001033964566886425, "mask/share_reasoning": 0.9100552201271057, "mask/share_step_conf": 0.0068795569241046906, "num_tokens": 6753893.0, "reward": 0.08146456629037857, "reward_std": 0.14879143238067627, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.021036718040704727, "rewards/format_reward_step": 0.0703125, "rewards/stepwise_brier_reward": 0.05084826797246933, "step": 25 }, { "adv/mean_abs_final_conf": 0.1946616768836975, "adv/mean_abs_reasoning": 0.22415024042129517, "adv/mean_abs_step_conf": 0.22112242877483368, "adv/ratio_final_to_reasoning": 0.8684428645618525, "adv/ratio_step_to_reasoning": 0.9864920437258035, "adv/std_final_conf": 0.520853579044342, "adv/std_reasoning": 0.5492832064628601, "adv/std_step_conf": 0.5490449070930481, "calib/answer_extract_rate": 0.11328125, "calib/auroc": 0.6428571428571428, "calib/avg_num_step_conf": 0.83203125, "calib/ece": 0.6833333333333332, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.0546875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.0635714285714285, "calib/mean_conf": 0.9055555555555556, "calib/mu_c": 0.955, "calib/mu_w": 0.8914285714285715, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.1484375, "calib/nonempty_step_conf_rate": 0.10546875, "calib/pce": 0.6833333333333332, "calib/std_conf": 0.13136980129146128, "calib/step_conf_rate": 0.10546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 614.3125, "completions/mean_terminated_length": 674.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.027733333333333332, "grad_norm": 0.2409486472606659, "learning_rate": 4.833333333333333e-06, "loss": -1.2538, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.00079687888501212, "mask/share_reasoning": 0.903478741645813, "mask/share_step_conf": 0.005880659446120262, "num_tokens": 7016397.0, "reward": 0.06622931361198425, "reward_std": 0.1676761656999588, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.023710157722234726, "rewards/format_reward_step": 0.0546875, "rewards/stepwise_brier_reward": 0.034666046500205994, "step": 26 }, { "adv/mean_abs_final_conf": 0.17820599675178528, "adv/mean_abs_reasoning": 0.2028244286775589, "adv/mean_abs_step_conf": 0.1893235743045807, "adv/ratio_final_to_reasoning": 0.8786219584776404, "adv/ratio_step_to_reasoning": 0.9334357579064538, "adv/std_final_conf": 0.4923804700374603, "adv/std_reasoning": 0.5237365365028381, "adv/std_step_conf": 0.4966087341308594, "calib/answer_extract_rate": 0.10546875, "calib/auroc": 0.6, "calib/avg_num_step_conf": 0.3125, "calib/ece": 0.6275000000000001, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.02036363636363636, "calib/mean_conf": 0.94, "calib/mu_c": 0.954, "calib/mu_w": 0.9336363636363636, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.11328125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.6275000000000001, "calib/std_conf": 0.04703721930556695, "calib/step_conf_rate": 0.078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 603.59765625, "completions/mean_terminated_length": 654.75, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0288, "grad_norm": 0.1442146897315979, "learning_rate": 4.805555555555556e-06, "loss": -1.0975, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.0009431581711396575, "mask/share_reasoning": 0.9170562028884888, "mask/share_step_conf": 0.003875626251101494, "num_tokens": 7276134.0, "reward": 0.0711732804775238, "reward_std": 0.15970629453659058, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.02380312606692314, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.03278874605894089, "step": 27 }, { "adv/mean_abs_final_conf": 0.11494233459234238, "adv/mean_abs_reasoning": 0.11595869064331055, "adv/mean_abs_step_conf": 0.11588902771472931, "adv/ratio_final_to_reasoning": 0.9912351886233824, "adv/ratio_step_to_reasoning": 0.9993992435737695, "adv/std_final_conf": 0.40212392807006836, "adv/std_reasoning": 0.4056612551212311, "adv/std_step_conf": 0.4054175913333893, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.1875, "calib/ece": 0.5733333333333334, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.03499999999999992, "calib/mean_conf": 0.9066666666666667, "calib/mu_c": 0.9299999999999999, "calib/mu_w": 0.895, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.5733333333333334, "calib/std_conf": 0.09551032521262937, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 747.94921875, "completions/mean_terminated_length": 843.502197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.029866666666666666, "grad_norm": 0.13489677011966705, "learning_rate": 4.777777777777778e-06, "loss": -0.8107, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.00030747134587727487, "mask/share_reasoning": 0.8835909962654114, "mask/share_step_conf": 0.00282029015943408, "num_tokens": 7574553.0, "reward": 0.02977309562265873, "reward_std": 0.08421102911233902, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010675780475139618, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.015145798213779926, "step": 28 }, { "adv/mean_abs_final_conf": 0.13464173674583435, "adv/mean_abs_reasoning": 0.1608918309211731, "adv/mean_abs_step_conf": 0.16683977842330933, "adv/ratio_final_to_reasoning": 0.8368463207544723, "adv/ratio_step_to_reasoning": 1.0369686109486214, "adv/std_final_conf": 0.4364698529243469, "adv/std_reasoning": 0.46844682097435, "adv/std_step_conf": 0.46820372343063354, "calib/answer_extract_rate": 0.08203125, "calib/auroc": 0.513888888888889, "calib/avg_num_step_conf": 0.3359375, "calib/ece": 0.6437692307692308, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.04296875, "calib/frac_conf_gt_0.9": 0.9230769230769231, "calib/gap": 0.012333333333333307, "calib/mean_conf": 0.9514615384615385, "calib/mu_c": 0.96, "calib/mu_w": 0.9476666666666667, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.6437692307692308, "calib/std_conf": 0.04967063113785786, "calib/step_conf_rate": 0.06640625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 692.18359375, "completions/mean_terminated_length": 767.09521484375, "completions/min_length": 0.0, "completions/min_terminated_length": 7.0, "epoch": 0.030933333333333334, "grad_norm": 0.21704789996147156, "learning_rate": 4.75e-06, "loss": -0.8478, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0006637288024649024, "mask/share_reasoning": 0.8980405926704407, "mask/share_step_conf": 0.0036394214257597923, "num_tokens": 7858880.0, "reward": 0.05586779862642288, "reward_std": 0.1327630877494812, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.018447261303663254, "rewards/format_reward_step": 0.04296875, "rewards/stepwise_brier_reward": 0.028293216601014137, "step": 29 }, { "adv/mean_abs_final_conf": 0.0374080091714859, "adv/mean_abs_reasoning": 0.07730154693126678, "adv/mean_abs_step_conf": 0.05792953446507454, "adv/ratio_final_to_reasoning": 0.4839231639794673, "adv/ratio_step_to_reasoning": 0.7493968331136115, "adv/std_final_conf": 0.22671973705291748, "adv/std_reasoning": 0.33120280504226685, "adv/std_step_conf": 0.28659942746162415, "calib/answer_extract_rate": 0.0625, "calib/auroc": 0.3888888888888889, "calib/avg_num_step_conf": 0.29296875, "calib/ece": 0.823, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.03000000000000025, "calib/mean_conf": 0.9229999999999998, "calib/mu_c": 0.95, "calib/mu_w": 0.9199999999999997, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.823, "calib/std_conf": 0.11550324670761418, "calib/step_conf_rate": 0.05078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 707.390625, "completions/mean_terminated_length": 780.5689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.032, "grad_norm": 0.09119506180286407, "learning_rate": 4.722222222222222e-06, "loss": -0.263, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.001026460900902748, "mask/share_reasoning": 0.9018102884292603, "mask/share_step_conf": 0.0034132516011595726, "num_tokens": 8146956.0, "reward": 0.012917187064886093, "reward_std": 0.03653532266616821, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0004585937422234565, "rewards/format_reward_step": 0.01171875, "rewards/stepwise_brier_reward": 0.006073827389627695, "step": 30 }, { "adv/mean_abs_final_conf": 0.10018590092658997, "adv/mean_abs_reasoning": 0.10260053724050522, "adv/mean_abs_step_conf": 0.10107364505529404, "adv/ratio_final_to_reasoning": 0.9764656562347708, "adv/ratio_step_to_reasoning": 0.9851180878163239, "adv/std_final_conf": 0.3672908544540405, "adv/std_reasoning": 0.3702797293663025, "adv/std_step_conf": 0.3699702024459839, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.10546875, "calib/ece": 0.9275, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.875, "calib/mean_conf": 0.9275, "calib/mu_c": NaN, "calib/mu_w": 0.9275, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.9275, "calib/std_conf": 0.050682837331783206, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 728.1015625, "completions/mean_terminated_length": 813.9476318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.03306666666666667, "grad_norm": 0.10365401953458786, "learning_rate": 4.694444444444445e-06, "loss": -0.7181, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.001135041005909443, "mask/share_reasoning": 0.8914518356323242, "mask/share_step_conf": 0.0019443880300968885, "num_tokens": 8439262.0, "reward": 0.0171560849994421, "reward_std": 0.0432112030684948, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0035996094811707735, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.009074865840375423, "step": 31 }, { "adv/mean_abs_final_conf": 0.1154562383890152, "adv/mean_abs_reasoning": 0.11964812874794006, "adv/mean_abs_step_conf": 0.11882726103067398, "adv/ratio_final_to_reasoning": 0.9649648481527378, "adv/ratio_step_to_reasoning": 0.9931393183841982, "adv/std_final_conf": 0.4041202664375305, "adv/std_reasoning": 0.40568676590919495, "adv/std_step_conf": 0.40552806854248047, "calib/answer_extract_rate": 0.0625, "calib/auroc": 0.25925925925925924, "calib/avg_num_step_conf": 0.14453125, "calib/ece": 0.6475000000000001, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": 0.07444444444444431, "calib/mean_conf": 0.8975, "calib/mu_c": 0.9533333333333333, "calib/mu_w": 0.8788888888888889, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.6475000000000001, "calib/std_conf": 0.18046814123273947, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 669.89453125, "completions/mean_terminated_length": 742.3939208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 7.0, "epoch": 0.034133333333333335, "grad_norm": 0.20890480279922485, "learning_rate": 4.666666666666667e-06, "loss": -0.7214, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0010919722262769938, "mask/share_reasoning": 0.8988701701164246, "mask/share_step_conf": 0.0023815971799194813, "num_tokens": 8717459.0, "reward": 0.041473038494586945, "reward_std": 0.10357101261615753, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.017754295840859413, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.01938142627477646, "step": 32 }, { "adv/mean_abs_final_conf": 0.14984360337257385, "adv/mean_abs_reasoning": 0.19187571108341217, "adv/mean_abs_step_conf": 0.1522672474384308, "adv/ratio_final_to_reasoning": 0.7809409670796419, "adv/ratio_step_to_reasoning": 0.7935722899926463, "adv/std_final_conf": 0.4321545660495758, "adv/std_reasoning": 0.49684324860572815, "adv/std_step_conf": 0.4377925992012024, "calib/answer_extract_rate": 0.0859375, "calib/auroc": 0.6153846153846154, "calib/avg_num_step_conf": 0.2109375, "calib/ece": 0.8000000000000002, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.03076923076923055, "calib/mean_conf": 0.9333333333333333, "calib/mu_c": 0.96, "calib/mu_w": 0.9292307692307694, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.8000000000000002, "calib/std_conf": 0.057696524062450134, "calib/step_conf_rate": 0.046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 724.015625, "completions/mean_terminated_length": 782.0590209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.0352, "grad_norm": 0.13100853562355042, "learning_rate": 4.638888888888889e-06, "loss": -1.0953, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.0013625889550894499, "mask/share_reasoning": 0.9213559627532959, "mask/share_step_conf": 0.0030626384541392326, "num_tokens": 9009679.0, "reward": 0.04265952855348587, "reward_std": 0.10281442105770111, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.008678124286234379, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.01847999542951584, "step": 33 }, { "adv/mean_abs_final_conf": 0.19780707359313965, "adv/mean_abs_reasoning": 0.20779672265052795, "adv/mean_abs_step_conf": 0.20860755443572998, "adv/ratio_final_to_reasoning": 0.9519258584545202, "adv/ratio_step_to_reasoning": 1.0039020431836438, "adv/std_final_conf": 0.5196132063865662, "adv/std_reasoning": 0.5236744284629822, "adv/std_step_conf": 0.5234167575836182, "calib/answer_extract_rate": 0.09375, "calib/auroc": 0.5294117647058824, "calib/avg_num_step_conf": 0.265625, "calib/ece": 0.7855555555555556, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.07294117647058829, "calib/mean_conf": 0.8411111111111111, "calib/mu_c": 0.91, "calib/mu_w": 0.8370588235294117, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.11328125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.7855555555555556, "calib/std_conf": 0.21931430740400581, "calib/step_conf_rate": 0.078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 643.7578125, "completions/mean_terminated_length": 739.0224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.03626666666666667, "grad_norm": 0.12947340309619904, "learning_rate": 4.611111111111112e-06, "loss": -0.9669, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.001347921323031187, "mask/share_reasoning": 0.865627110004425, "mask/share_step_conf": 0.004118745215237141, "num_tokens": 9279593.0, "reward": 0.048354651778936386, "reward_std": 0.11566583812236786, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.015407422557473183, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.030411846935749054, "step": 34 }, { "adv/mean_abs_final_conf": 0.10604438185691833, "adv/mean_abs_reasoning": 0.14723995327949524, "adv/mean_abs_step_conf": 0.10678407549858093, "adv/ratio_final_to_reasoning": 0.7202147209027006, "adv/ratio_step_to_reasoning": 0.72523845002776, "adv/std_final_conf": 0.36649808287620544, "adv/std_reasoning": 0.43816131353378296, "adv/std_step_conf": 0.3700529932975769, "calib/answer_extract_rate": 0.06640625, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.22265625, "calib/ece": 0.8400000000000001, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.05555555555555558, "calib/mean_conf": 0.9400000000000002, "calib/mu_c": 0.99, "calib/mu_w": 0.9344444444444444, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.8400000000000001, "calib/std_conf": 0.051185935568278884, "calib/step_conf_rate": 0.046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 760.9296875, "completions/mean_terminated_length": 832.4701538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.037333333333333336, "grad_norm": 0.12047702819108963, "learning_rate": 4.583333333333333e-06, "loss": -0.4872, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.000588306924328208, "mask/share_reasoning": 0.9106691479682922, "mask/share_step_conf": 0.002805058378726244, "num_tokens": 9583647.0, "reward": 0.028987498953938484, "reward_std": 0.07016594707965851, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0037542972713708878, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.01312909834086895, "step": 35 }, { "adv/mean_abs_final_conf": 0.15778377652168274, "adv/mean_abs_reasoning": 0.1813150942325592, "adv/mean_abs_step_conf": 0.1688225269317627, "adv/ratio_final_to_reasoning": 0.870218649966921, "adv/ratio_step_to_reasoning": 0.9311002354565515, "adv/std_final_conf": 0.4654620587825775, "adv/std_reasoning": 0.49686798453330994, "adv/std_step_conf": 0.4681243300437927, "calib/answer_extract_rate": 0.0859375, "calib/auroc": 0.4363636363636364, "calib/avg_num_step_conf": 0.1875, "calib/ece": 0.5506250000000001, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.5625, "calib/gap": 0.04127272727272713, "calib/mean_conf": 0.775625, "calib/mu_c": 0.8039999999999999, "calib/mu_w": 0.7627272727272728, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.5068750000000002, "calib/std_conf": 0.32256721683239914, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 625.63671875, "completions/mean_terminated_length": 672.9537963867188, "completions/min_length": 0.0, "completions/min_terminated_length": 18.0, "epoch": 0.0384, "grad_norm": 0.12413109093904495, "learning_rate": 4.555555555555556e-06, "loss": -1.0095, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.0008500913972966373, "mask/share_reasoning": 0.9263497591018677, "mask/share_step_conf": 0.0024876415263861418, "num_tokens": 9846522.0, "reward": 0.06386526674032211, "reward_std": 0.1407494843006134, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.02153554931282997, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.031025253236293793, "step": 36 }, { "adv/mean_abs_final_conf": 0.07668406516313553, "adv/mean_abs_reasoning": 0.10092823207378387, "adv/mean_abs_step_conf": 0.07726116478443146, "adv/ratio_final_to_reasoning": 0.7597880552101163, "adv/ratio_step_to_reasoning": 0.765505975849745, "adv/std_final_conf": 0.3285629451274872, "adv/std_reasoning": 0.3703406751155853, "adv/std_step_conf": 0.33102983236312866, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.16666666666666669, "calib/avg_num_step_conf": 0.08203125, "calib/ece": 0.7225, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.625, "calib/gap": -0.35000000000000003, "calib/mean_conf": 0.7225, "calib/mu_c": 0.46, "calib/mu_w": 0.81, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.5975, "calib/std_conf": 0.3910482706776748, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 597.72265625, "completions/mean_terminated_length": 695.5317993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.039466666666666664, "grad_norm": 0.09366196393966675, "learning_rate": 4.527777777777778e-06, "loss": -0.563, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.00032904170802794397, "mask/share_reasoning": 0.8579970598220825, "mask/share_step_conf": 0.00104893883690238, "num_tokens": 10106635.0, "reward": 0.025437016040086746, "reward_std": 0.06532414257526398, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.005510937422513962, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.009056063368916512, "step": 37 }, { "adv/mean_abs_final_conf": 0.0598284974694252, "adv/mean_abs_reasoning": 0.10520166158676147, "adv/mean_abs_step_conf": 0.08477747440338135, "adv/ratio_final_to_reasoning": 0.5687029707233635, "adv/ratio_step_to_reasoning": 0.8058568004029483, "adv/std_final_conf": 0.28494229912757874, "adv/std_reasoning": 0.3703286945819855, "adv/std_step_conf": 0.3308514952659607, "calib/answer_extract_rate": 0.046875, "calib/auroc": 0.29166666666666663, "calib/avg_num_step_conf": 0.17578125, "calib/ece": 0.7075, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": -0.0033333333333334103, "calib/mean_conf": 0.9575, "calib/mu_c": 0.955, "calib/mu_w": 0.9583333333333334, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.7075, "calib/std_conf": 0.02586020108197149, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 744.63671875, "completions/mean_terminated_length": 811.1787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.04053333333333333, "grad_norm": 0.0911625474691391, "learning_rate": 4.5e-06, "loss": -0.5374, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0005922014825046062, "mask/share_reasoning": 0.9159488677978516, "mask/share_step_conf": 0.0014276672154664993, "num_tokens": 10404150.0, "reward": 0.025644494220614433, "reward_std": 0.05945397913455963, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.005485547240823507, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.00948371458798647, "step": 38 }, { "adv/mean_abs_final_conf": 0.07660110294818878, "adv/mean_abs_reasoning": 0.08327651768922806, "adv/mean_abs_step_conf": 0.08296199142932892, "adv/ratio_final_to_reasoning": 0.9198403712562689, "adv/ratio_step_to_reasoning": 0.9962231098438471, "adv/std_final_conf": 0.32866036891937256, "adv/std_reasoning": 0.33119046688079834, "adv/std_step_conf": 0.3309943675994873, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.1328125, "calib/ece": 0.758, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.6, "calib/mean_conf": 0.758, "calib/mu_c": NaN, "calib/mu_w": 0.758, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.758, "calib/std_conf": 0.3121794355815258, "calib/step_conf_rate": 0.03125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 656.76953125, "completions/mean_terminated_length": 697.6473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0416, "grad_norm": 0.0807473286986351, "learning_rate": 4.472222222222223e-06, "loss": -0.4533, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.00039599655428901315, "mask/share_reasoning": 0.9395262002944946, "mask/share_step_conf": 0.0014840353978797793, "num_tokens": 10678371.0, "reward": 0.016320213675498962, "reward_std": 0.03991464152932167, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.005438671912997961, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.010389840230345726, "step": 39 }, { "adv/mean_abs_final_conf": 0.02359767258167267, "adv/mean_abs_reasoning": 0.025304459035396576, "adv/mean_abs_step_conf": 0.0234784297645092, "adv/ratio_final_to_reasoning": 0.932549972661482, "adv/ratio_step_to_reasoning": 0.9278376483633547, "adv/std_final_conf": 0.16475501656532288, "adv/std_reasoning": 0.16561181843280792, "adv/std_step_conf": 0.1655532270669937, "calib/answer_extract_rate": 0.0390625, "calib/avg_num_step_conf": 0.015625, "calib/ece": 0.6166666666666667, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.6166666666666667, "calib/mu_c": NaN, "calib/mu_w": 0.6166666666666667, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.6166666666666667, "calib/std_conf": 0.4365266951236265, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 686.71484375, "completions/mean_terminated_length": 767.6812133789062, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.042666666666666665, "grad_norm": 0.1018519401550293, "learning_rate": 4.444444444444444e-06, "loss": -0.1728, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0006191778229549527, "mask/share_reasoning": 0.8933592438697815, "mask/share_step_conf": 0.0005528143374249339, "num_tokens": 10960930.0, "reward": 0.0061796484515070915, "reward_std": 0.011564082466065884, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0011230468517169356, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.003985273651778698, "step": 40 }, { "adv/mean_abs_final_conf": 0.07689839601516724, "adv/mean_abs_reasoning": 0.10181869566440582, "adv/mean_abs_step_conf": 0.10532698780298233, "adv/ratio_final_to_reasoning": 0.755248292205826, "adv/ratio_step_to_reasoning": 1.0344562667561548, "adv/std_final_conf": 0.3298797309398651, "adv/std_reasoning": 0.3703538179397583, "adv/std_step_conf": 0.370156466960907, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.19444444444444442, "calib/avg_num_step_conf": 0.3515625, "calib/ece": 0.6922222222222222, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.7777777777777778, "calib/gap": -0.25, "calib/mean_conf": 0.8033333333333332, "calib/mu_c": 0.6366666666666666, "calib/mu_w": 0.8866666666666666, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.5811111111111111, "calib/std_conf": 0.31843366656181316, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 634.71484375, "completions/mean_terminated_length": 725.388427734375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.04373333333333333, "grad_norm": 0.05635469779372215, "learning_rate": 4.416666666666667e-06, "loss": -0.4841, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0004555766936391592, "mask/share_reasoning": 0.8720616102218628, "mask/share_step_conf": 0.0024828272871673107, "num_tokens": 11230665.0, "reward": 0.03794757276773453, "reward_std": 0.09024789929389954, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.011569530703127384, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.01932913064956665, "step": 41 }, { "adv/mean_abs_final_conf": 0.10751166194677353, "adv/mean_abs_reasoning": 0.10858826339244843, "adv/mean_abs_step_conf": 0.10843968391418457, "adv/ratio_final_to_reasoning": 0.9900854713756315, "adv/ratio_step_to_reasoning": 0.9986317169680956, "adv/std_final_conf": 0.3666759729385376, "adv/std_reasoning": 0.3703175485134125, "adv/std_step_conf": 0.3700626790523529, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.18359375, "calib/ece": 0.8477777777777777, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.02124999999999999, "calib/mean_conf": 0.9588888888888888, "calib/mu_c": 0.94, "calib/mu_w": 0.9612499999999999, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.8477777777777777, "calib/std_conf": 0.007370277311900895, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 557.27734375, "completions/mean_terminated_length": 622.9825439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0448, "grad_norm": 0.0798545628786087, "learning_rate": 4.388888888888889e-06, "loss": -0.6422, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0004774858825840056, "mask/share_reasoning": 0.8922837972640991, "mask/share_step_conf": 0.0017699991585686803, "num_tokens": 11477696.0, "reward": 0.025472892448306084, "reward_std": 0.061406031250953674, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.005729687865823507, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.012924695387482643, "step": 42 }, { "adv/mean_abs_final_conf": 0.10083452612161636, "adv/mean_abs_reasoning": 0.10260053724050522, "adv/mean_abs_step_conf": 0.10188859701156616, "adv/ratio_final_to_reasoning": 0.9827875061243669, "adv/ratio_step_to_reasoning": 0.993061047748023, "adv/std_final_conf": 0.36501818895339966, "adv/std_reasoning": 0.3702797293663025, "adv/std_step_conf": 0.3699541687965393, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 0.10546875, "calib/ece": 0.9424999999999999, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.875, "calib/mean_conf": 0.9424999999999999, "calib/mu_c": NaN, "calib/mu_w": 0.9424999999999999, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.9424999999999999, "calib/std_conf": 0.05402545696243574, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 674.1484375, "completions/mean_terminated_length": 750.3565063476562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.04586666666666667, "grad_norm": 0.11436983942985535, "learning_rate": 4.361111111111112e-06, "loss": -0.8247, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0002926693414337933, "mask/share_reasoning": 0.8972545862197876, "mask/share_step_conf": 0.0008902625413611531, "num_tokens": 11755502.0, "reward": 0.016670772805809975, "reward_std": 0.04176222160458565, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0016867187805473804, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.009060687385499477, "step": 43 }, { "adv/mean_abs_final_conf": 0.08186276257038116, "adv/mean_abs_reasoning": 0.08327651768922806, "adv/mean_abs_step_conf": 0.08257116377353668, "adv/ratio_final_to_reasoning": 0.9830233641118046, "adv/ratio_step_to_reasoning": 0.991529978254811, "adv/std_final_conf": 0.32681629061698914, "adv/std_reasoning": 0.33119046688079834, "adv/std_step_conf": 0.33087316155433655, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.09765625, "calib/ece": 0.96, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.96, "calib/std_conf": 0.005773502691896263, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 726.59375, "completions/mean_terminated_length": 812.2620239257812, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.046933333333333334, "grad_norm": 0.1150890588760376, "learning_rate": 4.333333333333334e-06, "loss": -0.5969, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0002706256345845759, "mask/share_reasoning": 0.8929933905601501, "mask/share_step_conf": 0.0012672271113842726, "num_tokens": 12047830.0, "reward": 0.014166897162795067, "reward_std": 0.034422408789396286, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0014558595139533281, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.008074614219367504, "step": 44 }, { "adv/mean_abs_final_conf": 0.0662812888622284, "adv/mean_abs_reasoning": 0.08627735078334808, "adv/mean_abs_step_conf": 0.0632265955209732, "adv/ratio_final_to_reasoning": 0.7682350960064595, "adv/ratio_step_to_reasoning": 0.7328295890742188, "adv/std_final_conf": 0.2857547700405121, "adv/std_reasoning": 0.3312488794326782, "adv/std_step_conf": 0.28668659925460815, "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.8333333333333334, "calib/avg_num_step_conf": 0.08984375, "calib/ece": 0.729, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.15666666666666662, "calib/mean_conf": 0.8290000000000001, "calib/mu_c": 0.97, "calib/mu_w": 0.8133333333333334, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.729, "calib/std_conf": 0.28605768648998053, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 719.265625, "completions/mean_terminated_length": 770.4267578125, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.048, "grad_norm": 0.09593694657087326, "learning_rate": 4.305555555555556e-06, "loss": -0.5089, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0005101782153360546, "mask/share_reasoning": 0.9324560761451721, "mask/share_step_conf": 0.0006274882471188903, "num_tokens": 12337010.0, "reward": 0.02451431378722191, "reward_std": 0.057681601494550705, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008652344346046448, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.009546207264065742, "step": 45 }, { "adv/mean_abs_final_conf": 0.058217599987983704, "adv/mean_abs_reasoning": 0.06654815375804901, "adv/mean_abs_step_conf": 0.06974774599075317, "adv/ratio_final_to_reasoning": 0.8748191602677223, "adv/ratio_step_to_reasoning": 1.0480793538516036, "adv/std_final_conf": 0.28529322147369385, "adv/std_reasoning": 0.28685876727104187, "adv/std_step_conf": 0.2865889370441437, "calib/answer_extract_rate": 0.03125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.07421875, "calib/ece": 0.6372222222222221, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.19933333333333336, "calib/mean_conf": 0.8038888888888889, "calib/mu_c": 0.97, "calib/mu_w": 0.7706666666666666, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.6372222222222221, "calib/std_conf": 0.23014018808726022, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 656.640625, "completions/mean_terminated_length": 734.0611572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.04906666666666667, "grad_norm": 0.08373303711414337, "learning_rate": 4.277777777777778e-06, "loss": -0.481, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0002997597330249846, "mask/share_reasoning": 0.8933658599853516, "mask/share_step_conf": 0.0008656186982989311, "num_tokens": 12609878.0, "reward": 0.019132791087031364, "reward_std": 0.040449466556310654, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.007249609567224979, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.0072970278561115265, "step": 46 }, { "adv/mean_abs_final_conf": 0.07674593478441238, "adv/mean_abs_reasoning": 0.07731065154075623, "adv/mean_abs_step_conf": 0.07728387415409088, "adv/ratio_final_to_reasoning": 0.9926954857436929, "adv/ratio_step_to_reasoning": 0.9996536391023528, "adv/std_final_conf": 0.32882946729660034, "adv/std_reasoning": 0.3312418460845947, "adv/std_step_conf": 0.3311271071434021, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.078125, "calib/ece": 0.6266666666666667, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.96, "calib/mu_c": 0.96, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.6266666666666667, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 710.0, "completions/mean_terminated_length": 793.7117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 37.0, "epoch": 0.050133333333333335, "grad_norm": 0.09604458510875702, "learning_rate": 4.25e-06, "loss": -0.5308, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.00019567902199923992, "mask/share_reasoning": 0.8936618566513062, "mask/share_step_conf": 0.0006737294606864452, "num_tokens": 12897614.0, "reward": 0.023951200768351555, "reward_std": 0.06774422526359558, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008412499912083149, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.012446149252355099, "step": 47 }, { "adv/mean_abs_final_conf": 0.08069179952144623, "adv/mean_abs_reasoning": 0.10261328518390656, "adv/mean_abs_step_conf": 0.08180000633001328, "adv/ratio_final_to_reasoning": 0.7863679578801908, "adv/ratio_step_to_reasoning": 0.7971677954117626, "adv/std_final_conf": 0.32742586731910706, "adv/std_reasoning": 0.3703286051750183, "adv/std_step_conf": 0.33102738857269287, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.2265625, "calib/ece": 0.7883333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": -0.06599999999999995, "calib/mean_conf": 0.955, "calib/mu_c": 0.9, "calib/mu_w": 0.966, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.7883333333333333, "calib/std_conf": 0.025658007197234402, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 689.80859375, "completions/mean_terminated_length": 767.7869262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0512, "grad_norm": 0.052539192140102386, "learning_rate": 4.222222222222223e-06, "loss": -0.5068, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0003127142845187336, "mask/share_reasoning": 0.896579384803772, "mask/share_step_conf": 0.001545402454212308, "num_tokens": 13177893.0, "reward": 0.0242399163544178, "reward_std": 0.06236550211906433, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.00486523425206542, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.010890969075262547, "step": 48 }, { "adv/mean_abs_final_conf": 0.07648647576570511, "adv/mean_abs_reasoning": 0.07989173382520676, "adv/mean_abs_step_conf": 0.08035692572593689, "adv/ratio_final_to_reasoning": 0.9573765908378965, "adv/ratio_step_to_reasoning": 1.0058227788840823, "adv/std_final_conf": 0.32796990871429443, "adv/std_reasoning": 0.33121076226234436, "adv/std_step_conf": 0.3310745358467102, "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.6, "calib/avg_num_step_conf": 0.1796875, "calib/ece": 0.7916666666666667, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0020000000000000018, "calib/mean_conf": 0.9583333333333334, "calib/mu_c": 0.96, "calib/mu_w": 0.958, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.7916666666666667, "calib/std_conf": 0.003726779962499652, "calib/step_conf_rate": 0.046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 641.75390625, "completions/mean_terminated_length": 687.4016723632812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.05226666666666667, "grad_norm": 0.09106607735157013, "learning_rate": 4.194444444444445e-06, "loss": -0.5967, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.00034151755971834064, "mask/share_reasoning": 0.9314118027687073, "mask/share_step_conf": 0.0018404647707939148, "num_tokens": 13446718.0, "reward": 0.021333744749426842, "reward_std": 0.05266954004764557, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.005199609324336052, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.012723932042717934, "step": 49 }, { "adv/mean_abs_final_conf": 0.13274917006492615, "adv/mean_abs_reasoning": 0.1391177475452423, "adv/mean_abs_step_conf": 0.13292866945266724, "adv/ratio_final_to_reasoning": 0.9542216748567968, "adv/ratio_step_to_reasoning": 0.9555119443652411, "adv/std_final_conf": 0.40295490622520447, "adv/std_reasoning": 0.4056834280490875, "adv/std_step_conf": 0.40546315908432007, "calib/answer_extract_rate": 0.0703125, "calib/auroc": 0.5555555555555556, "calib/avg_num_step_conf": 0.21484375, "calib/ece": 0.6661538461538459, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8461538461538461, "calib/gap": -0.029722222222222427, "calib/mean_conf": 0.9430769230769229, "calib/mu_c": 0.9224999999999999, "calib/mu_w": 0.9522222222222223, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.0859375, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.6507692307692305, "calib/std_conf": 0.044442636386628556, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 661.9453125, "completions/mean_terminated_length": 730.4224243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.05333333333333334, "grad_norm": 0.15760673582553864, "learning_rate": 4.166666666666667e-06, "loss": -0.9254, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0007866570958867669, "mask/share_reasoning": 0.9025189876556396, "mask/share_step_conf": 0.002944336738437414, "num_tokens": 13721536.0, "reward": 0.049756817519664764, "reward_std": 0.09445478022098541, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.014207031577825546, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.022097613662481308, "step": 50 }, { "adv/mean_abs_final_conf": 0.05740310996770859, "adv/mean_abs_reasoning": 0.07730336487293243, "adv/mean_abs_step_conf": 0.07719512283802032, "adv/ratio_final_to_reasoning": 0.7425693572597398, "adv/ratio_step_to_reasoning": 0.9985997758947488, "adv/std_final_conf": 0.2839992642402649, "adv/std_reasoning": 0.3312106132507324, "adv/std_step_conf": 0.3307470679283142, "calib/answer_extract_rate": 0.01953125, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.0703125, "calib/ece": 0.9199999999999999, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": -0.8933333333333332, "calib/mean_conf": 0.6699999999999999, "calib/mu_c": 0.0, "calib/mu_w": 0.8933333333333332, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.6699999999999999, "calib/std_conf": 0.3953479480154159, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 640.1328125, "completions/mean_terminated_length": 718.74560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0544, "grad_norm": 0.1499822437763214, "learning_rate": 4.138888888888889e-06, "loss": -0.5292, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.00039907536120153964, "mask/share_reasoning": 0.8892779350280762, "mask/share_step_conf": 0.0009479941800236702, "num_tokens": 13994706.0, "reward": 0.015216129831969738, "reward_std": 0.04303771257400513, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.002262500114738941, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.005863510072231293, "step": 51 }, { "adv/mean_abs_final_conf": 0.09578309953212738, "adv/mean_abs_reasoning": 0.0966200977563858, "adv/mean_abs_step_conf": 0.09649413079023361, "adv/ratio_final_to_reasoning": 0.9913372244109212, "adv/ratio_step_to_reasoning": 0.9986962653829041, "adv/std_final_conf": 0.36706623435020447, "adv/std_reasoning": 0.3702698349952698, "adv/std_step_conf": 0.36978742480278015, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.078125, "calib/ece": 0.888, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8, "calib/mean_conf": 0.8880000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.8880000000000001, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.888, "calib/std_conf": 0.11016351483136327, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 734.859375, "completions/mean_terminated_length": 825.1052856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 9.0, "epoch": 0.055466666666666664, "grad_norm": 0.15719400346279144, "learning_rate": 4.111111111111111e-06, "loss": -0.5916, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0002719433978199959, "mask/share_reasoning": 0.8888888359069824, "mask/share_step_conf": 0.0014642456080764532, "num_tokens": 14290782.0, "reward": 0.01490075420588255, "reward_std": 0.04214569926261902, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.003892968874424696, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.008323774673044682, "step": 52 }, { "adv/mean_abs_final_conf": 0.0574328675866127, "adv/mean_abs_reasoning": 0.05797934532165527, "adv/mean_abs_step_conf": 0.05788220465183258, "adv/ratio_final_to_reasoning": 0.9905746135626257, "adv/ratio_step_to_reasoning": 0.998324564217071, "adv/std_final_conf": 0.2841477394104004, "adv/std_reasoning": 0.2868458330631256, "adv/std_step_conf": 0.28636565804481506, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.6666666666666667, "calib/avg_num_step_conf": 0.046875, "calib/ece": 0.5525, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.20999999999999996, "calib/mean_conf": 0.8025, "calib/mu_c": 0.96, "calib/mu_w": 0.75, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.015625, "calib/pce": 0.5525, "calib/std_conf": 0.27279800219209815, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 721.37890625, "completions/mean_terminated_length": 789.2008666992188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.05653333333333333, "grad_norm": 0.08934785425662994, "learning_rate": 4.083333333333334e-06, "loss": -0.4098, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0001739041181281209, "mask/share_reasoning": 0.913271427154541, "mask/share_step_conf": 0.0006171360146254301, "num_tokens": 14581279.0, "reward": 0.0136606115847826, "reward_std": 0.03863804414868355, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.004512500017881393, "rewards/format_reward_step": 0.01171875, "rewards/stepwise_brier_reward": 0.005533724091947079, "step": 53 }, { "adv/mean_abs_final_conf": 0.12810653448104858, "adv/mean_abs_reasoning": 0.15454918146133423, "adv/mean_abs_step_conf": 0.13039252161979675, "adv/ratio_final_to_reasoning": 0.8289046455616383, "adv/ratio_step_to_reasoning": 0.8436959703498585, "adv/std_final_conf": 0.4014511704444885, "adv/std_reasoning": 0.43816977739334106, "adv/std_step_conf": 0.4052298665046692, "calib/answer_extract_rate": 0.08203125, "calib/auroc": 0.6777777777777778, "calib/avg_num_step_conf": 0.1875, "calib/ece": 0.7527777777777777, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": 0.060666666666666424, "calib/mean_conf": 0.9194444444444445, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.9093333333333337, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.7527777777777777, "calib/std_conf": 0.15668341124070756, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 619.06640625, "completions/mean_terminated_length": 677.269287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0576, "grad_norm": 0.0982208400964737, "learning_rate": 4.055555555555556e-06, "loss": -0.6749, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.0009278756915591657, "mask/share_reasoning": 0.9105731248855591, "mask/share_step_conf": 0.0025614770129323006, "num_tokens": 14845992.0, "reward": 0.04713316261768341, "reward_std": 0.08477863669395447, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.010382031090557575, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.01876281015574932, "step": 54 }, { "adv/mean_abs_final_conf": 0.08847086131572723, "adv/mean_abs_reasoning": 0.09186718612909317, "adv/mean_abs_step_conf": 0.09058874100446701, "adv/ratio_final_to_reasoning": 0.9630300550558567, "adv/ratio_step_to_reasoning": 0.9860837674637202, "adv/std_final_conf": 0.3300953507423401, "adv/std_reasoning": 0.3312879204750061, "adv/std_step_conf": 0.3310956656932831, "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.5833333333333334, "calib/avg_num_step_conf": 0.1484375, "calib/ece": 0.483, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.1283333333333333, "calib/mean_conf": 0.883, "calib/mu_c": 0.96, "calib/mu_w": 0.8316666666666667, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.483, "calib/std_conf": 0.22777401080895948, "calib/step_conf_rate": 0.03125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 573.73046875, "completions/mean_terminated_length": 655.6920166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.058666666666666666, "grad_norm": 0.047732096165418625, "learning_rate": 4.027777777777779e-06, "loss": -0.5643, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0008438924560323358, "mask/share_reasoning": 0.8722988367080688, "mask/share_step_conf": 0.0018572770059108734, "num_tokens": 15100691.0, "reward": 0.04101718217134476, "reward_std": 0.0868811085820198, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.016591796651482582, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.01514472346752882, "step": 55 }, { "adv/mean_abs_final_conf": 0.11497887969017029, "adv/mean_abs_reasoning": 0.13528089225292206, "adv/mean_abs_step_conf": 0.11585846543312073, "adv/ratio_final_to_reasoning": 0.8499269762000461, "adv/ratio_step_to_reasoning": 0.8564288977079703, "adv/std_final_conf": 0.4022437632083893, "adv/std_reasoning": 0.43815046548843384, "adv/std_step_conf": 0.40531063079833984, "calib/answer_extract_rate": 0.0546875, "calib/auroc": 0.7083333333333334, "calib/avg_num_step_conf": 0.2109375, "calib/ece": 0.64625, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.11166666666666669, "calib/mean_conf": 0.89625, "calib/mu_c": 0.98, "calib/mu_w": 0.8683333333333333, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.64625, "calib/std_conf": 0.15889756920733558, "calib/step_conf_rate": 0.05078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 619.46875, "completions/mean_terminated_length": 660.7667236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.05973333333333333, "grad_norm": 0.1226649284362793, "learning_rate": 4.000000000000001e-06, "loss": -0.879, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.00035208818735554814, "mask/share_reasoning": 0.934766411781311, "mask/share_step_conf": 0.002381462138146162, "num_tokens": 15366115.0, "reward": 0.02765616402029991, "reward_std": 0.07822343707084656, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008952734060585499, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.011773454956710339, "step": 56 }, { "adv/mean_abs_final_conf": 0.1396850049495697, "adv/mean_abs_reasoning": 0.14115017652511597, "adv/mean_abs_step_conf": 0.1439182162284851, "adv/ratio_final_to_reasoning": 0.9896197680256846, "adv/ratio_step_to_reasoning": 1.0196106003656085, "adv/std_final_conf": 0.43321725726127625, "adv/std_reasoning": 0.43815895915031433, "adv/std_step_conf": 0.4378523528575897, "calib/answer_extract_rate": 0.046875, "calib/auroc": 0.5208333333333333, "calib/avg_num_step_conf": 0.19140625, "calib/ece": 0.6118181818181818, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.10833333333333339, "calib/mean_conf": 0.8845454545454546, "calib/mu_c": 0.9633333333333334, "calib/mu_w": 0.855, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.6118181818181818, "calib/std_conf": 0.2485727856223728, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 632.21484375, "completions/mean_terminated_length": 674.362548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0608, "grad_norm": 0.09539405256509781, "learning_rate": 3.972222222222223e-06, "loss": -1.0366, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0004407475353218615, "mask/share_reasoning": 0.9352881908416748, "mask/share_step_conf": 0.0017710895044729114, "num_tokens": 15634754.0, "reward": 0.04146331921219826, "reward_std": 0.09492677450180054, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.013380076736211777, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.017642855644226074, "step": 57 }, { "adv/mean_abs_final_conf": 0.056917693465948105, "adv/mean_abs_reasoning": 0.05797205865383148, "adv/mean_abs_step_conf": 0.05793406441807747, "adv/ratio_final_to_reasoning": 0.9818125280977288, "adv/ratio_step_to_reasoning": 0.9993446112379606, "adv/std_final_conf": 0.28159940242767334, "adv/std_reasoning": 0.2868097722530365, "adv/std_step_conf": 0.2866218388080597, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.109375, "calib/ece": 0.835, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.75, "calib/mean_conf": 0.835, "calib/mu_c": NaN, "calib/mu_w": 0.835, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.835, "calib/std_conf": 0.3163463292026636, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 530.86328125, "completions/mean_terminated_length": 593.4541625976562, "completions/min_length": 0.0, "completions/min_terminated_length": 9.0, "epoch": 0.06186666666666667, "grad_norm": 0.036493152379989624, "learning_rate": 3.944444444444445e-06, "loss": -0.387, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.000371251895558089, "mask/share_reasoning": 0.8929678797721863, "mask/share_step_conf": 0.0011921343393623829, "num_tokens": 15876975.0, "reward": 0.00919120479375124, "reward_std": 0.025996655225753784, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0007671875064261258, "rewards/format_reward_step": 0.01171875, "rewards/stepwise_brier_reward": 0.006280066911131144, "step": 58 }, { "adv/mean_abs_final_conf": 0.09573530405759811, "adv/mean_abs_reasoning": 0.09922303259372711, "adv/mean_abs_step_conf": 0.10180923342704773, "adv/ratio_final_to_reasoning": 0.9648496075461668, "adv/ratio_step_to_reasoning": 1.026064521167277, "adv/std_final_conf": 0.36712777614593506, "adv/std_reasoning": 0.37032580375671387, "adv/std_step_conf": 0.37004950642585754, "calib/answer_extract_rate": 0.0546875, "calib/auroc": 0.4375, "calib/avg_num_step_conf": 0.1484375, "calib/ece": 0.6970000000000001, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.07874999999999999, "calib/mean_conf": 0.897, "calib/mu_c": 0.96, "calib/mu_w": 0.88125, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.6970000000000001, "calib/std_conf": 0.19910047714659046, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 568.78125, "completions/mean_terminated_length": 624.9270629882812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.06293333333333333, "grad_norm": 0.11433440446853638, "learning_rate": 3.916666666666667e-06, "loss": -0.618, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0011701658368110657, "mask/share_reasoning": 0.9075009226799011, "mask/share_step_conf": 0.0014852045569568872, "num_tokens": 16128831.0, "reward": 0.02821742370724678, "reward_std": 0.07275127619504929, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.012121873907744884, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.01131141185760498, "step": 59 }, { "adv/mean_abs_final_conf": 0.05764734745025635, "adv/mean_abs_reasoning": 0.08159875124692917, "adv/mean_abs_step_conf": 0.07725159823894501, "adv/ratio_final_to_reasoning": 0.706473402709405, "adv/ratio_step_to_reasoning": 0.946725250796681, "adv/std_final_conf": 0.2852080762386322, "adv/std_reasoning": 0.33123520016670227, "adv/std_step_conf": 0.33098888397216797, "calib/answer_extract_rate": 0.046875, "calib/auroc": 0.41666666666666663, "calib/avg_num_step_conf": 0.0625, "calib/ece": 0.6962499999999999, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": 0.018333333333333424, "calib/mean_conf": 0.94625, "calib/mu_c": 0.96, "calib/mu_w": 0.9416666666666665, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.6962499999999999, "calib/std_conf": 0.0567753247458788, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 649.25390625, "completions/mean_terminated_length": 707.2723388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.064, "grad_norm": 0.136693075299263, "learning_rate": 3.88888888888889e-06, "loss": -0.4638, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0004379056626930833, "mask/share_reasoning": 0.9162116050720215, "mask/share_step_conf": 0.0013192400801926851, "num_tokens": 16403896.0, "reward": 0.021914906799793243, "reward_std": 0.05458424985408783, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.005612500011920929, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.009773565456271172, "step": 60 }, { "adv/mean_abs_final_conf": 0.06954145431518555, "adv/mean_abs_reasoning": 0.0892697125673294, "adv/mean_abs_step_conf": 0.06976863741874695, "adv/ratio_final_to_reasoning": 0.7790039008217449, "adv/ratio_step_to_reasoning": 0.7815488076779202, "adv/std_final_conf": 0.28522348403930664, "adv/std_reasoning": 0.33124879002571106, "adv/std_step_conf": 0.28665006160736084, "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.7, "calib/avg_num_step_conf": 0.12890625, "calib/ece": 0.46125, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": 0.20333333333333325, "calib/mean_conf": 0.8362499999999999, "calib/mu_c": 0.9633333333333333, "calib/mu_w": 0.76, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.06640625, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.46125, "calib/std_conf": 0.3165808546011587, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 600.72265625, "completions/mean_terminated_length": 657.2008666992188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.06506666666666666, "grad_norm": 0.038836970925331116, "learning_rate": 3.861111111111112e-06, "loss": -0.3511, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.00039035986992530525, "mask/share_reasoning": 0.9121092557907104, "mask/share_step_conf": 0.0015629110857844353, "num_tokens": 16661745.0, "reward": 0.029278146103024483, "reward_std": 0.06075853109359741, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.00908671785145998, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.011044181883335114, "step": 61 }, { "adv/mean_abs_final_conf": 0.03828040510416031, "adv/mean_abs_reasoning": 0.03864803910255432, "adv/mean_abs_step_conf": 0.0386255644261837, "adv/ratio_final_to_reasoning": 0.9904876416260479, "adv/ratio_step_to_reasoning": 0.9994184782231517, "adv/std_final_conf": 0.23195363581180573, "adv/std_reasoning": 0.23417921364307404, "adv/std_step_conf": 0.23404304683208466, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.06640625, "calib/ece": 0.9480000000000001, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.8, "calib/mean_conf": 0.9480000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.9480000000000001, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.9480000000000001, "calib/std_conf": 0.023999999999999973, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 687.5703125, "completions/mean_terminated_length": 730.3651733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.06613333333333334, "grad_norm": 0.039971452206373215, "learning_rate": 3.833333333333334e-06, "loss": -0.2889, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.00042069313349202275, "mask/share_reasoning": 0.9400171041488647, "mask/share_step_conf": 0.0009684442775323987, "num_tokens": 16944843.0, "reward": 0.006424275226891041, "reward_std": 0.018170595169067383, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0010484375525265932, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.004511831793934107, "step": 62 }, { "adv/mean_abs_final_conf": 0.09742030501365662, "adv/mean_abs_reasoning": 0.11756224930286407, "adv/mean_abs_step_conf": 0.09357871860265732, "adv/ratio_final_to_reasoning": 0.8286699649875042, "adv/ratio_step_to_reasoning": 0.7959929242386274, "adv/std_final_conf": 0.32832229137420654, "adv/std_reasoning": 0.3703429102897644, "adv/std_step_conf": 0.33102813363075256, "calib/answer_extract_rate": 0.0546875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.16796875, "calib/ece": 0.45999999999999996, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.96, "calib/mu_c": 0.96, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.45999999999999996, "calib/std_conf": 0.004082482904638634, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 640.94140625, "completions/mean_terminated_length": 689.416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0672, "grad_norm": 0.06894773989915848, "learning_rate": 3.8055555555555556e-06, "loss": -0.5592, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0006614683079533279, "mask/share_reasoning": 0.926390528678894, "mask/share_step_conf": 0.002635482233017683, "num_tokens": 17217564.0, "reward": 0.061490196734666824, "reward_std": 0.0757787898182869, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.021030467003583908, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.026527659967541695, "step": 63 }, { "adv/mean_abs_final_conf": 0.07636594772338867, "adv/mean_abs_reasoning": 0.07729607820510864, "adv/mean_abs_step_conf": 0.07724502682685852, "adv/ratio_final_to_reasoning": 0.9879666536347184, "adv/ratio_step_to_reasoning": 0.999339534690044, "adv/std_final_conf": 0.327199786901474, "adv/std_reasoning": 0.3311794102191925, "adv/std_step_conf": 0.3309606909751892, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 0.09765625, "calib/ece": 0.93, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.8, "calib/mean_conf": 0.93, "calib/mu_c": NaN, "calib/mu_w": 0.93, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.93, "calib/std_conf": 0.06511528238439879, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 587.94140625, "completions/mean_terminated_length": 637.7669677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.06826666666666667, "grad_norm": 0.09593202918767929, "learning_rate": 3.777777777777778e-06, "loss": -0.561, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.00035268941428512335, "mask/share_reasoning": 0.9200137853622437, "mask/share_step_conf": 0.0015085700433701277, "num_tokens": 17471853.0, "reward": 0.012422004714608192, "reward_std": 0.03513474017381668, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0022496094461530447, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.008094205521047115, "step": 64 }, { "adv/mean_abs_final_conf": 0.20421713590621948, "adv/mean_abs_reasoning": 0.22086596488952637, "adv/mean_abs_step_conf": 0.22406552731990814, "adv/ratio_final_to_reasoning": 0.9246202148364762, "adv/ratio_step_to_reasoning": 1.0144864439932253, "adv/std_final_conf": 0.5198385119438171, "adv/std_reasoning": 0.5237278342247009, "adv/std_step_conf": 0.5231757164001465, "calib/answer_extract_rate": 0.0859375, "calib/auroc": 0.5882352941176471, "calib/avg_num_step_conf": 0.3125, "calib/ece": 0.7566000000000002, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.0625, "calib/frac_conf_gt_0.9": 0.85, "calib/gap": 0.05105882352941182, "calib/mean_conf": 0.9066000000000001, "calib/mu_c": 0.9500000000000001, "calib/mu_w": 0.8989411764705882, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.7566000000000002, "calib/std_conf": 0.14892159010700898, "calib/step_conf_rate": 0.078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 584.03125, "completions/mean_terminated_length": 615.2756958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.06933333333333333, "grad_norm": 0.10870470106601715, "learning_rate": 3.7500000000000005e-06, "loss": -1.1016, "mask/has_final_conf_rate": 0.078125, "mask/share_final_conf": 0.0013415388530120254, "mask/share_reasoning": 0.9434852600097656, "mask/share_step_conf": 0.0043919761665165424, "num_tokens": 17726389.0, "reward": 0.06128288805484772, "reward_std": 0.13415977358818054, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.017960937693715096, "rewards/format_reward_step": 0.0625, "rewards/stepwise_brier_reward": 0.02764780819416046, "step": 65 }, { "adv/mean_abs_final_conf": 0.10111625492572784, "adv/mean_abs_reasoning": 0.1084204912185669, "adv/mean_abs_step_conf": 0.10549107193946838, "adv/ratio_final_to_reasoning": 0.93263048146393, "adv/ratio_step_to_reasoning": 0.9729809444121311, "adv/std_final_conf": 0.3663294315338135, "adv/std_reasoning": 0.37030887603759766, "adv/std_step_conf": 0.3700118362903595, "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.4545454545454546, "calib/avg_num_step_conf": 0.171875, "calib/ece": 0.7338461538461541, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.9230769230769231, "calib/gap": 0.08545454545454534, "calib/mean_conf": 0.8876923076923079, "calib/mu_c": 0.96, "calib/mu_w": 0.8745454545454546, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.7338461538461541, "calib/std_conf": 0.2562797103573133, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 519.078125, "completions/mean_terminated_length": 570.3175659179688, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.0704, "grad_norm": 0.05091705545783043, "learning_rate": 3.7222222222222225e-06, "loss": -0.6861, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0012055350234732032, "mask/share_reasoning": 0.9070212841033936, "mask/share_step_conf": 0.0019294099183753133, "num_tokens": 17965625.0, "reward": 0.029108598828315735, "reward_std": 0.05871621146798134, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.005662109702825546, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.012417396530508995, "step": 66 }, { "adv/mean_abs_final_conf": 0.15815921127796173, "adv/mean_abs_reasoning": 0.1605871617794037, "adv/mean_abs_step_conf": 0.16047969460487366, "adv/ratio_final_to_reasoning": 0.9848807932431286, "adv/ratio_step_to_reasoning": 0.9993307860146525, "adv/std_final_conf": 0.4613058865070343, "adv/std_reasoning": 0.4684103727340698, "adv/std_step_conf": 0.4680980443954468, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.42500000000000004, "calib/avg_num_step_conf": 0.15625, "calib/ece": 0.7958666666666669, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.015040000000000164, "calib/mean_conf": 0.9625333333333336, "calib/mu_c": 0.95, "calib/mu_w": 0.9650400000000001, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.7958666666666669, "calib/std_conf": 0.016449788921307013, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2982.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 598.203125, "completions/mean_terminated_length": 651.6595458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 6.0, "epoch": 0.07146666666666666, "grad_norm": 0.09449908882379532, "learning_rate": 3.694444444444445e-06, "loss": -0.9013, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0004775570996571332, "mask/share_reasoning": 0.9159537553787231, "mask/share_step_conf": 0.0015373954083770514, "num_tokens": 18223773.0, "reward": 0.03755014389753342, "reward_std": 0.10067607462406158, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.009698078036308289, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.01946999505162239, "step": 67 }, { "adv/mean_abs_final_conf": 0.2543319761753082, "adv/mean_abs_reasoning": 0.2732967436313629, "adv/mean_abs_step_conf": 0.2662087380886078, "adv/ratio_final_to_reasoning": 0.9306074152071298, "adv/ratio_step_to_reasoning": 0.9740648005952248, "adv/std_final_conf": 0.5624877214431763, "adv/std_reasoning": 0.5737119317054749, "adv/std_step_conf": 0.57329261302948, "calib/answer_extract_rate": 0.1328125, "calib/auroc": 0.28985507246376807, "calib/avg_num_step_conf": 0.40625, "calib/ece": 0.8156538461538463, "calib/final_conf_rate": 0.1015625, "calib/format_rate": 0.078125, "calib/frac_conf_gt_0.9": 0.7692307692307693, "calib/gap": -0.2426231884057971, "calib/mean_conf": 0.8579615384615384, "calib/mu_c": 0.6433333333333333, "calib/mu_w": 0.8859565217391304, "calib/nonempty_final_conf_rate": 0.1015625, "calib/nonempty_reasoning_rate": 0.1484375, "calib/nonempty_step_conf_rate": 0.09765625, "calib/pce": 0.7791153846153848, "calib/std_conf": 0.2582456315468235, "calib/step_conf_rate": 0.09765625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 600.4140625, "completions/mean_terminated_length": 656.86328125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.07253333333333334, "grad_norm": 0.15409648418426514, "learning_rate": 3.6666666666666666e-06, "loss": -1.4648, "mask/has_final_conf_rate": 0.1015625, "mask/share_final_conf": 0.002301223110407591, "mask/share_reasoning": 0.9055416584014893, "mask/share_step_conf": 0.006219647824764252, "num_tokens": 18481567.0, "reward": 0.07864746451377869, "reward_std": 0.15497872233390808, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.017530042678117752, "rewards/format_reward_step": 0.078125, "rewards/stepwise_brier_reward": 0.03915491700172424, "step": 68 }, { "adv/mean_abs_final_conf": 0.07668646425008774, "adv/mean_abs_reasoning": 0.07989902049303055, "adv/mean_abs_step_conf": 0.08253868669271469, "adv/ratio_final_to_reasoning": 0.9597922950354186, "adv/ratio_step_to_reasoning": 1.0330375289133162, "adv/std_final_conf": 0.32882875204086304, "adv/std_reasoning": 0.33124199509620667, "adv/std_step_conf": 0.33112096786499023, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.06640625, "calib/ece": 0.6283333333333332, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0050000000000000044, "calib/mean_conf": 0.9616666666666666, "calib/mu_c": 0.965, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.6283333333333332, "calib/std_conf": 0.0037267799624996524, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 669.1171875, "completions/mean_terminated_length": 716.7113037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0736, "grad_norm": 0.07533657550811768, "learning_rate": 3.638888888888889e-06, "loss": -0.607, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0002714511356316507, "mask/share_reasoning": 0.9325519800186157, "mask/share_step_conf": 0.000770289683714509, "num_tokens": 18757357.0, "reward": 0.027300164103507996, "reward_std": 0.06720856577157974, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008721483871340752, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.015083336271345615, "step": 69 }, { "adv/mean_abs_final_conf": 0.1340787559747696, "adv/mean_abs_reasoning": 0.15459944307804108, "adv/mean_abs_step_conf": 0.1544829159975052, "adv/ratio_final_to_reasoning": 0.8672654526128355, "adv/ratio_step_to_reasoning": 0.9992462645516965, "adv/std_final_conf": 0.4342644512653351, "adv/std_reasoning": 0.46838048100471497, "adv/std_step_conf": 0.4680275321006775, "calib/answer_extract_rate": 0.0625, "calib/auroc": 0.5555555555555556, "calib/avg_num_step_conf": 0.15625, "calib/ece": 0.8126184, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.05264622222222204, "calib/mean_conf": 0.9126183999999998, "calib/mu_c": 0.96, "calib/mu_w": 0.9073537777777779, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.8126184, "calib/std_conf": 0.13736768239669767, "calib/step_conf_rate": 0.046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 617.078125, "completions/mean_terminated_length": 660.970703125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.07466666666666667, "grad_norm": 0.16354045271873474, "learning_rate": 3.6111111111111115e-06, "loss": -0.8991, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0004902217769995332, "mask/share_reasoning": 0.9311144351959229, "mask/share_step_conf": 0.0019891324918717146, "num_tokens": 19022321.0, "reward": 0.03013056144118309, "reward_std": 0.08522209525108337, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00877256877720356, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.016812337562441826, "step": 70 }, { "adv/mean_abs_final_conf": 0.18683239817619324, "adv/mean_abs_reasoning": 0.188472718000412, "adv/mean_abs_step_conf": 0.19015845656394958, "adv/ratio_final_to_reasoning": 0.9912967784323291, "adv/ratio_step_to_reasoning": 1.0089442046648571, "adv/std_final_conf": 0.49154990911483765, "adv/std_reasoning": 0.49680474400520325, "adv/std_step_conf": 0.49653515219688416, "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.2142857142857143, "calib/avg_num_step_conf": 0.21484375, "calib/ece": 0.7658, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.12592857142857117, "calib/mean_conf": 0.8324666666666668, "calib/mu_c": 0.95, "calib/mu_w": 0.8240714285714288, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.0859375, "calib/nonempty_step_conf_rate": 0.0625, "calib/pce": 0.7658, "calib/std_conf": 0.29652180732995376, "calib/step_conf_rate": 0.0625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 595.44921875, "completions/mean_terminated_length": 648.6595458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.07573333333333333, "grad_norm": 0.11147620528936386, "learning_rate": 3.5833333333333335e-06, "loss": -1.0824, "mask/has_final_conf_rate": 0.05859375, "mask/share_final_conf": 0.0007515978068113327, "mask/share_reasoning": 0.9147189855575562, "mask/share_step_conf": 0.0024981689639389515, "num_tokens": 19279164.0, "reward": 0.043333299458026886, "reward_std": 0.10258567333221436, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.01187230832874775, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.026042941957712173, "step": 71 }, { "adv/mean_abs_final_conf": 0.21666905283927917, "adv/mean_abs_reasoning": 0.23212051391601562, "adv/mean_abs_step_conf": 0.23053902387619019, "adv/ratio_final_to_reasoning": 0.9334334530970106, "adv/ratio_step_to_reasoning": 0.9931867717628885, "adv/std_final_conf": 0.5434837341308594, "adv/std_reasoning": 0.5492792129516602, "adv/std_step_conf": 0.5488502383232117, "calib/answer_extract_rate": 0.1015625, "calib/auroc": 0.703125, "calib/avg_num_step_conf": 0.3125, "calib/ece": 0.7449999999999998, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.0625, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.02499999999999991, "calib/mean_conf": 0.9449999999999997, "calib/mu_c": 0.9649999999999999, "calib/mu_w": 0.94, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.11328125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.7449999999999998, "calib/std_conf": 0.05843800133474792, "calib/step_conf_rate": 0.078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 536.8046875, "completions/mean_terminated_length": 577.4033813476562, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0768, "grad_norm": 0.10440513491630554, "learning_rate": 3.555555555555556e-06, "loss": -1.3869, "mask/has_final_conf_rate": 0.078125, "mask/share_final_conf": 0.001047117868438363, "mask/share_reasoning": 0.9244768619537354, "mask/share_step_conf": 0.004163539037108421, "num_tokens": 19520994.0, "reward": 0.06722670793533325, "reward_std": 0.1429571509361267, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.015968751162290573, "rewards/format_reward_step": 0.0625, "rewards/stepwise_brier_reward": 0.03271903842687607, "step": 72 }, { "adv/mean_abs_final_conf": 0.06860332190990448, "adv/mean_abs_reasoning": 0.09225049614906311, "adv/mean_abs_step_conf": 0.072090283036232, "adv/ratio_final_to_reasoning": 0.7436634465255525, "adv/ratio_step_to_reasoning": 0.7814622798314796, "adv/std_final_conf": 0.2840113639831543, "adv/std_reasoning": 0.33122873306274414, "adv/std_step_conf": 0.2866162061691284, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.5833333333333333, "calib/avg_num_step_conf": 0.0703125, "calib/ece": 0.8088228571428571, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009706666666666752, "calib/mean_conf": 0.9516799999999999, "calib/mu_c": 0.96, "calib/mu_w": 0.9502933333333332, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.8088228571428571, "calib/std_conf": 0.020379754659956027, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 542.765625, "completions/mean_terminated_length": 588.7626953125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.07786666666666667, "grad_norm": 0.08159588277339935, "learning_rate": 3.5277777777777784e-06, "loss": -0.5564, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0004280888824723661, "mask/share_reasoning": 0.9201780557632446, "mask/share_step_conf": 0.001268864842131734, "num_tokens": 19766974.0, "reward": 0.02091187797486782, "reward_std": 0.041246477514505386, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0022610502783209085, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.009443232789635658, "step": 73 }, { "adv/mean_abs_final_conf": 0.12885695695877075, "adv/mean_abs_reasoning": 0.133488729596138, "adv/mean_abs_step_conf": 0.12991607189178467, "adv/ratio_final_to_reasoning": 0.965302144597672, "adv/ratio_step_to_reasoning": 0.9732362596066185, "adv/std_final_conf": 0.4013332724571228, "adv/std_reasoning": 0.4056570529937744, "adv/std_step_conf": 0.40540289878845215, "calib/answer_extract_rate": 0.0703125, "calib/auroc": 0.5454545454545454, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.8733333333333333, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0036363636363636598, "calib/mean_conf": 0.9566666666666667, "calib/mu_c": 0.96, "calib/mu_w": 0.9563636363636363, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.8733333333333333, "calib/std_conf": 0.010274023338281613, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 643.421875, "completions/mean_terminated_length": 683.4689331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.07893333333333333, "grad_norm": 0.09621241688728333, "learning_rate": 3.5e-06, "loss": -0.984, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0008580433786846697, "mask/share_reasoning": 0.9374440312385559, "mask/share_step_conf": 0.0031041507609188557, "num_tokens": 20035618.0, "reward": 0.036248065531253815, "reward_std": 0.07528281211853027, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.006729297339916229, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.022256478667259216, "step": 74 }, { "adv/mean_abs_final_conf": 0.1569603532552719, "adv/mean_abs_reasoning": 0.18250501155853271, "adv/mean_abs_step_conf": 0.1614481508731842, "adv/ratio_final_to_reasoning": 0.860033113145125, "adv/ratio_step_to_reasoning": 0.8846231097681655, "adv/std_final_conf": 0.465222030878067, "adv/std_reasoning": 0.49683383107185364, "adv/std_step_conf": 0.4681553244590759, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.6481481481481481, "calib/avg_num_step_conf": 0.20703125, "calib/ece": 0.6158333333333333, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.13888888888888906, "calib/mean_conf": 0.8658333333333333, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.831111111111111, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.0625, "calib/pce": 0.6158333333333333, "calib/std_conf": 0.22343747571872435, "calib/step_conf_rate": 0.0625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 576.109375, "completions/mean_terminated_length": 611.9668579101562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.08, "grad_norm": 0.07777323573827744, "learning_rate": 3.4722222222222224e-06, "loss": -0.8569, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0005481348489411175, "mask/share_reasoning": 0.9385069608688354, "mask/share_step_conf": 0.0023511142935603857, "num_tokens": 20287854.0, "reward": 0.04728742688894272, "reward_std": 0.11980638653039932, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.016207030043005943, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.023971345275640488, "step": 75 }, { "adv/mean_abs_final_conf": 0.16909301280975342, "adv/mean_abs_reasoning": 0.1740565448999405, "adv/mean_abs_step_conf": 0.17162185907363892, "adv/ratio_final_to_reasoning": 0.9714832206221234, "adv/ratio_step_to_reasoning": 0.9860120983804361, "adv/std_final_conf": 0.4644908905029297, "adv/std_reasoning": 0.4684191942214966, "adv/std_step_conf": 0.4681659936904907, "calib/answer_extract_rate": 0.0859375, "calib/auroc": 0.7833333333333333, "calib/avg_num_step_conf": 0.20703125, "calib/ece": 0.6052941176470588, "calib/final_conf_rate": 0.06640625, "calib/format_rate": 0.05859375, "calib/frac_conf_gt_0.9": 0.8235294117647058, "calib/gap": 0.0858333333333331, "calib/mean_conf": 0.8994117647058824, "calib/mu_c": 0.96, "calib/mu_w": 0.8741666666666669, "calib/nonempty_final_conf_rate": 0.06640625, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.6052941176470588, "calib/std_conf": 0.17962282667186732, "calib/step_conf_rate": 0.06640625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 599.14453125, "completions/mean_terminated_length": 644.4580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.08106666666666666, "grad_norm": 0.14026468992233276, "learning_rate": 3.444444444444445e-06, "loss": -0.8141, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.0010099567007273436, "mask/share_reasoning": 0.924968957901001, "mask/share_step_conf": 0.003708635224029422, "num_tokens": 20544291.0, "reward": 0.07288633286952972, "reward_std": 0.1086694523692131, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.024304687976837158, "rewards/format_reward_step": 0.05859375, "rewards/stepwise_brier_reward": 0.035964079201221466, "step": 76 }, { "adv/mean_abs_final_conf": 0.12345867604017258, "adv/mean_abs_reasoning": 0.1375705897808075, "adv/mean_abs_step_conf": 0.1388530731201172, "adv/ratio_final_to_reasoning": 0.8974205623228078, "adv/ratio_step_to_reasoning": 1.0093223656404546, "adv/std_final_conf": 0.4033997654914856, "adv/std_reasoning": 0.4056600034236908, "adv/std_step_conf": 0.4053976833820343, "calib/answer_extract_rate": 0.06640625, "calib/auroc": 0.6818181818181819, "calib/avg_num_step_conf": 0.26171875, "calib/ece": 0.7975000000000001, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": 0.08636363636363642, "calib/mean_conf": 0.8808333333333334, "calib/mu_c": 0.96, "calib/mu_w": 0.8736363636363635, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.7975000000000001, "calib/std_conf": 0.18918502113598268, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 581.2734375, "completions/mean_terminated_length": 617.4523315429688, "completions/min_length": 0.0, "completions/min_terminated_length": 7.0, "epoch": 0.08213333333333334, "grad_norm": 0.11724759638309479, "learning_rate": 3.416666666666667e-06, "loss": -0.796, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.00063959916587919, "mask/share_reasoning": 0.9372695684432983, "mask/share_step_conf": 0.0034970752894878387, "num_tokens": 20797761.0, "reward": 0.04430808871984482, "reward_std": 0.07722712308168411, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.012422265484929085, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.02771754562854767, "step": 77 }, { "adv/mean_abs_final_conf": 0.18359941244125366, "adv/mean_abs_reasoning": 0.188472718000412, "adv/mean_abs_step_conf": 0.18686653673648834, "adv/ratio_final_to_reasoning": 0.9741431777985625, "adv/ratio_step_to_reasoning": 0.9914779110686983, "adv/std_final_conf": 0.4926977753639221, "adv/std_reasoning": 0.49680477380752563, "adv/std_step_conf": 0.49635177850723267, "calib/answer_extract_rate": 0.0625, "calib/auroc": 0.5833333333333334, "calib/avg_num_step_conf": 0.25390625, "calib/ece": 0.8392307692307694, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.7692307692307693, "calib/gap": 0.047499999999999876, "calib/mean_conf": 0.9161538461538463, "calib/mu_c": 0.96, "calib/mu_w": 0.9125000000000001, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.0625, "calib/pce": 0.8392307692307694, "calib/std_conf": 0.12219394807741323, "calib/step_conf_rate": 0.0625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 650.0625, "completions/mean_terminated_length": 693.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0832, "grad_norm": 0.1859666258096695, "learning_rate": 3.3888888888888893e-06, "loss": -1.4194, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0006620531203225255, "mask/share_reasoning": 0.9336406588554382, "mask/share_step_conf": 0.003197286743670702, "num_tokens": 21072201.0, "reward": 0.040519773960113525, "reward_std": 0.09668624401092529, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.010687890462577343, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.02100810408592224, "step": 78 }, { "adv/mean_abs_final_conf": 0.13988946378231049, "adv/mean_abs_reasoning": 0.14555124938488007, "adv/mean_abs_step_conf": 0.1389458030462265, "adv/ratio_final_to_reasoning": 0.9611010855180078, "adv/ratio_step_to_reasoning": 0.9546177283495051, "adv/std_final_conf": 0.4337327182292938, "adv/std_reasoning": 0.4381541311740875, "adv/std_step_conf": 0.4375132620334625, "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.6875, "calib/avg_num_step_conf": 0.10546875, "calib/ece": 0.660609090909091, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.041245833333333315, "calib/mean_conf": 0.9333363636363636, "calib/mu_c": 0.9633333333333333, "calib/mu_w": 0.9220875, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.660609090909091, "calib/std_conf": 0.08464065510065343, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 602.8046875, "completions/mean_terminated_length": 651.1307983398438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.08426666666666667, "grad_norm": 0.13712556660175323, "learning_rate": 3.3611111111111117e-06, "loss": -0.9517, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.000740687595680356, "mask/share_reasoning": 0.9237175583839417, "mask/share_step_conf": 0.0013229991309344769, "num_tokens": 21332895.0, "reward": 0.038617219775915146, "reward_std": 0.08786404132843018, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.011576997116208076, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.01675843819975853, "step": 79 }, { "adv/mean_abs_final_conf": 0.17335572838783264, "adv/mean_abs_reasoning": 0.22115306556224823, "adv/mean_abs_step_conf": 0.20430010557174683, "adv/ratio_final_to_reasoning": 0.7838721473161673, "adv/ratio_step_to_reasoning": 0.9237950423718735, "adv/std_final_conf": 0.4925078749656677, "adv/std_reasoning": 0.5492574572563171, "adv/std_step_conf": 0.5232893824577332, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.6309523809523809, "calib/avg_num_step_conf": 0.19140625, "calib/ece": 0.7464705882352941, "calib/final_conf_rate": 0.06640625, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.8823529411764706, "calib/gap": 0.061190476190475906, "calib/mean_conf": 0.9229411764705882, "calib/mu_c": 0.9733333333333333, "calib/mu_w": 0.9121428571428574, "calib/nonempty_final_conf_rate": 0.06640625, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.7464705882352941, "calib/std_conf": 0.13714589508910713, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 586.4296875, "completions/mean_terminated_length": 612.7591552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.08533333333333333, "grad_norm": 0.1025858148932457, "learning_rate": 3.3333333333333333e-06, "loss": -1.0038, "mask/has_final_conf_rate": 0.06640625, "mask/share_final_conf": 0.0011797649785876274, "mask/share_reasoning": 0.9526797533035278, "mask/share_step_conf": 0.003171744290739298, "num_tokens": 21585181.0, "reward": 0.05320657789707184, "reward_std": 0.13630428910255432, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.014562109485268593, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.028819594532251358, "step": 80 }, { "adv/mean_abs_final_conf": 0.07615146785974503, "adv/mean_abs_reasoning": 0.09662556648254395, "adv/mean_abs_step_conf": 0.07724253088235855, "adv/ratio_final_to_reasoning": 0.788108889105476, "adv/ratio_step_to_reasoning": 0.7994005488838498, "adv/std_final_conf": 0.32627594470977783, "adv/std_reasoning": 0.3702907860279083, "adv/std_step_conf": 0.330949991941452, "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.9, "calib/avg_num_step_conf": 0.0703125, "calib/ece": 0.7966666666666666, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008000000000000007, "calib/mean_conf": 0.9633333333333333, "calib/mu_c": 0.97, "calib/mu_w": 0.962, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.7966666666666666, "calib/std_conf": 0.004714045207910321, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2901.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 640.1875, "completions/mean_terminated_length": 674.4362182617188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0864, "grad_norm": 0.04565536603331566, "learning_rate": 3.3055555555555558e-06, "loss": -0.3859, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.00029859773349016905, "mask/share_reasoning": 0.9474666714668274, "mask/share_step_conf": 0.0014534371439367533, "num_tokens": 21855317.0, "reward": 0.01604899764060974, "reward_std": 0.04539342224597931, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0011496094521135092, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.00808569323271513, "step": 81 }, { "adv/mean_abs_final_conf": 0.1454237550497055, "adv/mean_abs_reasoning": 0.14722901582717896, "adv/mean_abs_step_conf": 0.14664122462272644, "adv/ratio_final_to_reasoning": 0.9877384171365208, "adv/ratio_step_to_reasoning": 0.9960076401981626, "adv/std_final_conf": 0.4333024024963379, "adv/std_reasoning": 0.4381259083747864, "adv/std_step_conf": 0.4377772808074951, "calib/answer_extract_rate": 0.0703125, "calib/avg_num_step_conf": 0.14453125, "calib/ece": 0.8723784615384617, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.7692307692307693, "calib/mean_conf": 0.8723784615384618, "calib/mu_c": NaN, "calib/mu_w": 0.8723784615384618, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.8723784615384617, "calib/std_conf": 0.2538613953093348, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 563.3671875, "completions/mean_terminated_length": 600.925048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.08746666666666666, "grad_norm": 0.07771904021501541, "learning_rate": 3.277777777777778e-06, "loss": -0.8274, "mask/has_final_conf_rate": 0.05078125, "mask/share_final_conf": 0.0006377262761816382, "mask/share_reasoning": 0.9350779056549072, "mask/share_step_conf": 0.0017843758687376976, "num_tokens": 22105091.0, "reward": 0.026922065764665604, "reward_std": 0.063670314848423, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0033345031552016735, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.017020627856254578, "step": 82 }, { "adv/mean_abs_final_conf": 0.07634367048740387, "adv/mean_abs_reasoning": 0.07730336487293243, "adv/mean_abs_step_conf": 0.07725274562835693, "adv/ratio_final_to_reasoning": 0.9875853478421532, "adv/ratio_step_to_reasoning": 0.9993451870476957, "adv/std_final_conf": 0.32711878418922424, "adv/std_reasoning": 0.3312106430530548, "adv/std_step_conf": 0.33099377155303955, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.7, "calib/avg_num_step_conf": 0.16796875, "calib/ece": 0.7183333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.1020000000000002, "calib/mean_conf": 0.8849999999999998, "calib/mu_c": 0.97, "calib/mu_w": 0.8679999999999998, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.7183333333333333, "calib/std_conf": 0.1359840676942217, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 591.35546875, "completions/mean_terminated_length": 641.4703369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.08853333333333334, "grad_norm": 0.10863806307315826, "learning_rate": 3.2500000000000002e-06, "loss": -0.4169, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0002931926283054054, "mask/share_reasoning": 0.9194286465644836, "mask/share_step_conf": 0.002153177745640278, "num_tokens": 22363742.0, "reward": 0.017213165760040283, "reward_std": 0.048686183989048004, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.005438281688839197, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.008269689977169037, "step": 83 }, { "adv/mean_abs_final_conf": 0.1318395733833313, "adv/mean_abs_reasoning": 0.1352827101945877, "adv/mean_abs_step_conf": 0.13516740500926971, "adv/ratio_final_to_reasoning": 0.9745485819562317, "adv/ratio_step_to_reasoning": 0.9991476724176198, "adv/std_final_conf": 0.4273732602596283, "adv/std_reasoning": 0.4381563663482666, "adv/std_step_conf": 0.43778303265571594, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.625, "calib/avg_num_step_conf": 0.11328125, "calib/ece": 0.7588087960118102, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.007739004985237163, "calib/mean_conf": 0.9588087960118102, "calib/mu_c": 0.965, "calib/mu_w": 0.9572609950147628, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.7588087960118102, "calib/std_conf": 0.022773163829472608, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 582.21875, "completions/mean_terminated_length": 605.8861694335938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0896, "grad_norm": 0.13366936147212982, "learning_rate": 3.2222222222222227e-06, "loss": -0.7068, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0006747072329744697, "mask/share_reasoning": 0.9586551189422607, "mask/share_step_conf": 0.001607641694135964, "num_tokens": 22618710.0, "reward": 0.03124072588980198, "reward_std": 0.08836211264133453, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0091178547590971, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.014953775331377983, "step": 84 }, { "adv/mean_abs_final_conf": 0.12445656955242157, "adv/mean_abs_reasoning": 0.13518846035003662, "adv/mean_abs_step_conf": 0.13008637726306915, "adv/ratio_final_to_reasoning": 0.9206153338100936, "adv/ratio_step_to_reasoning": 0.9622594778152151, "adv/std_final_conf": 0.4016876816749573, "adv/std_reasoning": 0.4056515097618103, "adv/std_step_conf": 0.4053715169429779, "calib/answer_extract_rate": 0.05859375, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.88, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.7777777777777778, "calib/mean_conf": 0.88, "calib/mu_c": NaN, "calib/mu_w": 0.88, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.88, "calib/std_conf": 0.1567021236472421, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 534.28515625, "completions/mean_terminated_length": 589.5560302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.09066666666666667, "grad_norm": 0.11454667896032333, "learning_rate": 3.1944444444444443e-06, "loss": -0.8361, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0006141673075035214, "mask/share_reasoning": 0.9030116200447083, "mask/share_step_conf": 0.0026242309249937534, "num_tokens": 22863311.0, "reward": 0.03183240070939064, "reward_std": 0.0626763254404068, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0070679690688848495, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.01716206967830658, "step": 85 }, { "adv/mean_abs_final_conf": 0.10018318891525269, "adv/mean_abs_reasoning": 0.10592220723628998, "adv/mean_abs_step_conf": 0.10617843270301819, "adv/ratio_final_to_reasoning": 0.9458185542882923, "adv/ratio_step_to_reasoning": 1.002418996671365, "adv/std_final_conf": 0.36842837929725647, "adv/std_reasoning": 0.3703352212905884, "adv/std_step_conf": 0.37000808119773865, "calib/answer_extract_rate": 0.06640625, "calib/auroc": 0.7142857142857143, "calib/avg_num_step_conf": 0.15625, "calib/ece": 0.7266666666666666, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.7777777777777778, "calib/gap": 0.020714285714285685, "calib/mean_conf": 0.9488888888888888, "calib/mu_c": 0.965, "calib/mu_w": 0.9442857142857143, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.7266666666666666, "calib/std_conf": 0.03142696805273543, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 593.3984375, "completions/mean_terminated_length": 632.9583740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.09173333333333333, "grad_norm": 0.14420361816883087, "learning_rate": 3.1666666666666667e-06, "loss": -0.7468, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0007306640036404133, "mask/share_reasoning": 0.9340777397155762, "mask/share_step_conf": 0.0026915583293884993, "num_tokens": 23120733.0, "reward": 0.03548293933272362, "reward_std": 0.07222367823123932, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.01073281280696392, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.014818215742707253, "step": 86 }, { "adv/mean_abs_final_conf": 0.0819677859544754, "adv/mean_abs_reasoning": 0.08327651768922806, "adv/mean_abs_step_conf": 0.07941487431526184, "adv/ratio_final_to_reasoning": 0.9842845045510117, "adv/ratio_step_to_reasoning": 0.9536286641046023, "adv/std_final_conf": 0.32665571570396423, "adv/std_reasoning": 0.33119046688079834, "adv/std_step_conf": 0.33089470863342285, "calib/answer_extract_rate": 0.0625, "calib/avg_num_step_conf": 0.14453125, "calib/ece": 0.9466666666666667, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8888888888888888, "calib/mean_conf": 0.9466666666666667, "calib/mu_c": NaN, "calib/mu_w": 0.9466666666666667, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.9466666666666667, "calib/std_conf": 0.025385910352879647, "calib/step_conf_rate": 0.03125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 535.41015625, "completions/mean_terminated_length": 575.9033813476562, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0928, "grad_norm": 0.07893165200948715, "learning_rate": 3.138888888888889e-06, "loss": -0.6318, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0013570425799116492, "mask/share_reasoning": 0.9254021644592285, "mask/share_step_conf": 0.0029283100739121437, "num_tokens": 23363294.0, "reward": 0.013725357130169868, "reward_std": 0.033733196556568146, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0015304686967283487, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.007154230028390884, "step": 87 }, { "adv/mean_abs_final_conf": 0.12596651911735535, "adv/mean_abs_reasoning": 0.12843841314315796, "adv/mean_abs_step_conf": 0.12427614629268646, "adv/ratio_final_to_reasoning": 0.9807542466049668, "adv/ratio_step_to_reasoning": 0.9675932865518027, "adv/std_final_conf": 0.4031435549259186, "adv/std_reasoning": 0.40566515922546387, "adv/std_step_conf": 0.40525007247924805, "calib/answer_extract_rate": 0.06640625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.11328125, "calib/ece": 0.7727272727272726, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.1060000000000001, "calib/mean_conf": 0.8636363636363636, "calib/mu_c": 0.96, "calib/mu_w": 0.8539999999999999, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.7727272727272726, "calib/std_conf": 0.2735381883684027, "calib/step_conf_rate": 0.05078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 623.82421875, "completions/mean_terminated_length": 679.5701904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.09386666666666667, "grad_norm": 0.15347856283187866, "learning_rate": 3.1111111111111116e-06, "loss": -0.8374, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0007594419876113534, "mask/share_reasoning": 0.9156370162963867, "mask/share_step_conf": 0.0015722834505140781, "num_tokens": 23632841.0, "reward": 0.0326203815639019, "reward_std": 0.07294408977031708, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010305078700184822, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.013213226571679115, "step": 88 }, { "adv/mean_abs_final_conf": 0.056992340832948685, "adv/mean_abs_reasoning": 0.09663102775812149, "adv/mean_abs_step_conf": 0.057933270931243896, "adv/ratio_final_to_reasoning": 0.5897933837111515, "adv/ratio_step_to_reasoning": 0.5995307332988065, "adv/std_final_conf": 0.28197088837623596, "adv/std_reasoning": 0.3703117072582245, "adv/std_step_conf": 0.2866179347038269, "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.7500000000000001, "calib/avg_num_step_conf": 0.0546875, "calib/ece": 0.7275992063492063, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.11613425925925924, "calib/mean_conf": 0.8704563492063492, "calib/mu_c": 0.97, "calib/mu_w": 0.8538657407407407, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.7275992063492063, "calib/std_conf": 0.2331131178592595, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 630.01171875, "completions/mean_terminated_length": 658.2979125976562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.09493333333333333, "grad_norm": 0.13982361555099487, "learning_rate": 3.0833333333333336e-06, "loss": -0.4437, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0010558613575994968, "mask/share_reasoning": 0.9537944793701172, "mask/share_step_conf": 0.0021809404715895653, "num_tokens": 23903012.0, "reward": 0.017247222363948822, "reward_std": 0.04878251254558563, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0008829089347273111, "rewards/format_reward_step": 0.01171875, "rewards/stepwise_brier_reward": 0.006709239911288023, "step": 89 }, { "adv/mean_abs_final_conf": 0.038533568382263184, "adv/mean_abs_reasoning": 0.05798481032252312, "adv/mean_abs_step_conf": 0.03862389177083969, "adv/ratio_final_to_reasoning": 0.6645459072459109, "adv/ratio_step_to_reasoning": 0.6661036149985812, "adv/std_final_conf": 0.23348626494407654, "adv/std_reasoning": 0.28687289357185364, "adv/std_step_conf": 0.23403292894363403, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.6875, "calib/avg_num_step_conf": 0.109375, "calib/ece": 0.46000000000000013, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.25749999999999995, "calib/mean_conf": 0.7933333333333334, "calib/mu_c": 0.965, "calib/mu_w": 0.7075, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.46000000000000013, "calib/std_conf": 0.35560589921365976, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 604.52734375, "completions/mean_terminated_length": 644.8292236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.096, "grad_norm": 0.11000526696443558, "learning_rate": 3.055555555555556e-06, "loss": -0.2259, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0015883477171882987, "mask/share_reasoning": 0.9321364164352417, "mask/share_step_conf": 0.0037752282805740833, "num_tokens": 24161091.0, "reward": 0.015415811911225319, "reward_std": 0.04360250011086464, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.004642187617719173, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.005073030013591051, "step": 90 }, { "adv/mean_abs_final_conf": 0.19097426533699036, "adv/mean_abs_reasoning": 0.1932547688484192, "adv/mean_abs_step_conf": 0.19305163621902466, "adv/ratio_final_to_reasoning": 0.9881994968350946, "adv/ratio_step_to_reasoning": 0.9989488868471139, "adv/std_final_conf": 0.5175157785415649, "adv/std_reasoning": 0.5236801505088806, "adv/std_step_conf": 0.5231298208236694, "calib/answer_extract_rate": 0.0703125, "calib/auroc": 0.6875, "calib/avg_num_step_conf": 0.140625, "calib/ece": 0.802128427128427, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.023350168350168343, "calib/mean_conf": 0.9449855699855699, "calib/mu_c": 0.965, "calib/mu_w": 0.9416498316498316, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.802128427128427, "calib/std_conf": 0.048943040131846964, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 616.54296875, "completions/mean_terminated_length": 646.86474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.09706666666666666, "grad_norm": 0.1311177760362625, "learning_rate": 3.0277777777777776e-06, "loss": -1.1402, "mask/has_final_conf_rate": 0.0546875, "mask/share_final_conf": 0.001614823006093502, "mask/share_reasoning": 0.9477262496948242, "mask/share_step_conf": 0.003783911233767867, "num_tokens": 24426638.0, "reward": 0.0390644446015358, "reward_std": 0.11049094051122665, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010537873953580856, "rewards/format_reward_step": 0.0390625, "rewards/stepwise_brier_reward": 0.018172457814216614, "step": 91 }, { "adv/mean_abs_final_conf": 0.11584252864122391, "adv/mean_abs_reasoning": 0.12452749907970428, "adv/mean_abs_step_conf": 0.12619081139564514, "adv/ratio_final_to_reasoning": 0.9302566059491685, "adv/ratio_step_to_reasoning": 1.0133569880406597, "adv/std_final_conf": 0.4036062955856323, "adv/std_reasoning": 0.4056704044342041, "adv/std_step_conf": 0.40536797046661377, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.8928571428571428, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.5548472222222224, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.23881249999999998, "calib/mean_conf": 0.7770694444444444, "calib/mu_c": 0.9628125, "calib/mu_w": 0.724, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.1015625, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.5548472222222224, "calib/std_conf": 0.33366705747932857, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 520.86328125, "completions/mean_terminated_length": 546.4794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.09813333333333334, "grad_norm": 0.11405057460069656, "learning_rate": 3e-06, "loss": -0.8509, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0007968221325427294, "mask/share_reasoning": 0.949183464050293, "mask/share_step_conf": 0.0031446926295757294, "num_tokens": 24666699.0, "reward": 0.036815397441387177, "reward_std": 0.08702380955219269, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.016938727349042892, "rewards/format_reward_step": 0.03125, "rewards/stepwise_brier_reward": 0.01828642562031746, "step": 92 }, { "adv/mean_abs_final_conf": 0.09549635648727417, "adv/mean_abs_reasoning": 0.10031682252883911, "adv/mean_abs_step_conf": 0.10555162280797958, "adv/ratio_final_to_reasoning": 0.9519475804750579, "adv/ratio_step_to_reasoning": 1.0521826763166824, "adv/std_final_conf": 0.3664848506450653, "adv/std_reasoning": 0.3703256845474243, "adv/std_step_conf": 0.37017467617988586, "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.2777777777777778, "calib/avg_num_step_conf": 0.12890625, "calib/ece": 0.6905539772727272, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.08265624999999999, "calib/mean_conf": 0.872372159090909, "calib/mu_c": 0.94, "calib/mu_w": 0.85734375, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.6905539772727272, "calib/std_conf": 0.27654168850000704, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 682.49609375, "completions/mean_terminated_length": 710.2398071289062, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0992, "grad_norm": 0.08944234251976013, "learning_rate": 2.9722222222222225e-06, "loss": -0.4989, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0006196058820933104, "mask/share_reasoning": 0.9587451219558716, "mask/share_step_conf": 0.001572765177115798, "num_tokens": 24947194.0, "reward": 0.03515958413481712, "reward_std": 0.07899127155542374, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.009003906510770321, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.0228484645485878, "step": 93 }, { "adv/mean_abs_final_conf": 0.1211947426199913, "adv/mean_abs_reasoning": 0.12562128901481628, "adv/mean_abs_step_conf": 0.12621799111366272, "adv/ratio_final_to_reasoning": 0.964762768878268, "adv/ratio_step_to_reasoning": 1.0047500077695912, "adv/std_final_conf": 0.4039490818977356, "adv/std_reasoning": 0.40567031502723694, "adv/std_step_conf": 0.4053996205329895, "calib/answer_extract_rate": 0.0625, "calib/auroc": 0.6944444444444444, "calib/avg_num_step_conf": 0.1171875, "calib/ece": 0.6869960907508896, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.1175603335266906, "calib/mean_conf": 0.8688142725690714, "calib/mu_c": 0.9650000000000001, "calib/mu_w": 0.8474396664733095, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.6869960907508896, "calib/std_conf": 0.2026797791513814, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 542.26171875, "completions/mean_terminated_length": 571.2716064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.10026666666666667, "grad_norm": 0.12918737530708313, "learning_rate": 2.944444444444445e-06, "loss": -0.9216, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0013458357425406575, "mask/share_reasoning": 0.9453219175338745, "mask/share_step_conf": 0.002550976350903511, "num_tokens": 25194693.0, "reward": 0.040408436208963394, "reward_std": 0.08953796327114105, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.015550907701253891, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.022260168567299843, "step": 94 }, { "adv/mean_abs_final_conf": 0.10610508918762207, "adv/mean_abs_reasoning": 0.1279323250055313, "adv/mean_abs_step_conf": 0.1084851622581482, "adv/ratio_final_to_reasoning": 0.8293845139063524, "adv/ratio_step_to_reasoning": 0.847988670990367, "adv/std_final_conf": 0.3683347702026367, "adv/std_reasoning": 0.4057179093360901, "adv/std_step_conf": 0.37018951773643494, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.28125, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.5637790697674417, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": -0.17744186046511623, "calib/mean_conf": 0.8637790697674419, "calib/mu_c": 0.7750581395348837, "calib/mu_w": 0.9524999999999999, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.46377906976744177, "calib/std_conf": 0.16117473486730388, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 551.0625, "completions/mean_terminated_length": 595.240478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.10133333333333333, "grad_norm": 0.12042226642370224, "learning_rate": 2.916666666666667e-06, "loss": -0.678, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.001040589064359665, "mask/share_reasoning": 0.9197874069213867, "mask/share_step_conf": 0.004953281953930855, "num_tokens": 25441893.0, "reward": 0.04625809192657471, "reward_std": 0.10904739797115326, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.015411455184221268, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.01840420439839363, "step": 95 }, { "adv/mean_abs_final_conf": 0.17711566388607025, "adv/mean_abs_reasoning": 0.22767430543899536, "adv/mean_abs_step_conf": 0.196292906999588, "adv/ratio_final_to_reasoning": 0.7779343547114843, "adv/ratio_step_to_reasoning": 0.8621653928892038, "adv/std_final_conf": 0.4906613528728485, "adv/std_reasoning": 0.5492765307426453, "adv/std_step_conf": 0.5232849717140198, "calib/answer_extract_rate": 0.078125, "calib/auroc": 0.3214285714285714, "calib/avg_num_step_conf": 0.171875, "calib/ece": 0.7791931115366915, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.04296875, "calib/frac_conf_gt_0.9": 0.9375, "calib/gap": 0.04377930110206696, "calib/mean_conf": 0.9041931115366915, "calib/mu_c": 0.942500000001, "calib/mu_w": 0.898720698898933, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.09375, "calib/nonempty_step_conf_rate": 0.07421875, "calib/pce": 0.7791931115366915, "calib/std_conf": 0.22850792172641496, "calib/step_conf_rate": 0.07421875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 594.328125, "completions/mean_terminated_length": 626.1234130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1024, "grad_norm": 0.14036454260349274, "learning_rate": 2.888888888888889e-06, "loss": -1.163, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.001665796386078, "mask/share_reasoning": 0.9442043900489807, "mask/share_step_conf": 0.0033485221210867167, "num_tokens": 25699857.0, "reward": 0.05740179494023323, "reward_std": 0.13414423167705536, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.01009805966168642, "rewards/format_reward_step": 0.04296875, "rewards/stepwise_brier_reward": 0.027723312377929688, "step": 96 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.9375, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9375, "calib/mu_c": NaN, "calib/mu_w": 0.9375, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9375, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 566.63671875, "completions/mean_terminated_length": 596.9506225585938, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.10346666666666667, "grad_norm": 0.0, "learning_rate": 2.861111111111111e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 9.337649680674076e-05, "mask/share_reasoning": 0.9488027691841125, "mask/share_step_conf": 0.00032261834712699056, "num_tokens": 25949988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 97 }, { "adv/mean_abs_final_conf": 0.03821493685245514, "adv/mean_abs_reasoning": 0.07731429487466812, "adv/mean_abs_step_conf": 0.038617927581071854, "adv/ratio_final_to_reasoning": 0.49428035157540046, "adv/ratio_step_to_reasoning": 0.4994927218009841, "adv/std_final_conf": 0.2315683364868164, "adv/std_reasoning": 0.3312574625015259, "adv/std_step_conf": 0.23399679362773895, "calib/answer_extract_rate": 0.03125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.0859375, "calib/ece": 0.3182323232323234, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.014848484848484889, "calib/mean_conf": 0.98489898989899, "calib/mu_c": 0.989848484848485, "calib/mu_w": 0.9750000000000001, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.3182323232323234, "calib/std_conf": 0.010848832829285312, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 631.0390625, "completions/mean_terminated_length": 664.79833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.10453333333333334, "grad_norm": 0.05403328686952591, "learning_rate": 2.8333333333333335e-06, "loss": -0.2989, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0006696260534226894, "mask/share_reasoning": 0.9450562000274658, "mask/share_step_conf": 0.0034929136745631695, "num_tokens": 26217718.0, "reward": 0.018742192536592484, "reward_std": 0.05301092937588692, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.004099121317267418, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.004184824880212545, "step": 98 }, { "adv/mean_abs_final_conf": 0.038170330226421356, "adv/mean_abs_reasoning": 0.057977523654699326, "adv/mean_abs_step_conf": 0.0386352613568306, "adv/ratio_final_to_reasoning": 0.6583642732613932, "adv/ratio_step_to_reasoning": 0.6663834348450832, "adv/std_final_conf": 0.23128490149974823, "adv/std_reasoning": 0.2868368327617645, "adv/std_step_conf": 0.234101802110672, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.06640625, "calib/ece": 0.955204081632653, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.955204081632653, "calib/mu_c": NaN, "calib/mu_w": 0.955204081632653, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.955204081632653, "calib/std_conf": 0.005204081632653068, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 580.92578125, "completions/mean_terminated_length": 624.8613891601562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1056, "grad_norm": 0.055600475519895554, "learning_rate": 2.805555555555556e-06, "loss": -0.3092, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0002980417921207845, "mask/share_reasoning": 0.9283925294876099, "mask/share_step_conf": 0.0009968822123482823, "num_tokens": 26472235.0, "reward": 0.010449407622218132, "reward_std": 0.029555387794971466, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0006840474670752883, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.004931790754199028, "step": 99 }, { "adv/mean_abs_final_conf": 0.03767964243888855, "adv/mean_abs_reasoning": 0.057977523654699326, "adv/mean_abs_step_conf": 0.03859805315732956, "adv/ratio_final_to_reasoning": 0.6499008592243394, "adv/ratio_step_to_reasoning": 0.6657416654635097, "adv/std_final_conf": 0.2283170223236084, "adv/std_reasoning": 0.2868368327617645, "adv/std_step_conf": 0.23387649655342102, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.9740130289711159, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9740130289711159, "calib/mu_c": NaN, "calib/mu_w": 0.9740130289711159, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.9740130289711159, "calib/std_conf": 0.011860412005284073, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 586.30078125, "completions/mean_terminated_length": 612.6244506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.10666666666666667, "grad_norm": 0.05185471847653389, "learning_rate": 2.7777777777777783e-06, "loss": -0.2999, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0013168000150471926, "mask/share_reasoning": 0.9537020921707153, "mask/share_step_conf": 0.002012330573052168, "num_tokens": 26729736.0, "reward": 0.010003788396716118, "reward_std": 0.028294987976551056, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00036270637065172195, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.004201224073767662, "step": 100 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.4219834728020131, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.4219834728020131, "calib/mu_c": NaN, "calib/mu_w": 0.4219834728020131, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.4219834728020131, "calib/std_conf": 0.4007358049345676, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 577.13671875, "completions/mean_terminated_length": 613.05810546875, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.10773333333333333, "grad_norm": 0.000803845701739192, "learning_rate": 2.7500000000000004e-06, "loss": -0.0019, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0013398993760347366, "mask/share_reasoning": 0.9395236968994141, "mask/share_step_conf": 0.0005426329444162548, "num_tokens": 26984475.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 101 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.05078125, "calib/ece": 0.9586917106662725, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9586917106662725, "calib/mu_c": NaN, "calib/mu_w": 0.9586917106662725, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.9586917106662725, "calib/std_conf": 0.011406376782531445, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 568.7265625, "completions/mean_terminated_length": 614.3206176757812, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.1088, "grad_norm": 0.0, "learning_rate": 2.7222222222222224e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0012479554861783981, "mask/share_reasoning": 0.9224805235862732, "mask/share_step_conf": 0.002052756492048502, "num_tokens": 27236765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 102 }, { "adv/mean_abs_final_conf": 0.038129813969135284, "adv/mean_abs_reasoning": 0.03864803910255432, "adv/mean_abs_step_conf": 0.03862028568983078, "adv/ratio_final_to_reasoning": 0.9865911661897282, "adv/ratio_step_to_reasoning": 0.9992818933801558, "adv/std_final_conf": 0.23103944957256317, "adv/std_reasoning": 0.23417921364307404, "adv/std_step_conf": 0.23401108384132385, "calib/answer_extract_rate": 0.05859375, "calib/avg_num_step_conf": 0.078125, "calib/ece": 0.9624061285918764, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9624061285918764, "calib/mu_c": NaN, "calib/mu_w": 0.9624061285918764, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.9624061285918764, "calib/std_conf": 0.0068011903033952095, "calib/step_conf_rate": 0.04296875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 604.109375, "completions/mean_terminated_length": 623.5967407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.10986666666666667, "grad_norm": 0.05884866788983345, "learning_rate": 2.6944444444444444e-06, "loss": -0.3036, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0016500115161761642, "mask/share_reasoning": 0.9648259878158569, "mask/share_step_conf": 0.0022740354761481285, "num_tokens": 27495969.0, "reward": 0.006301497109234333, "reward_std": 0.017823325470089912, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0006331427721306682, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.004473922774195671, "step": 103 }, { "adv/mean_abs_final_conf": 0.01893104799091816, "adv/mean_abs_reasoning": 0.038653504103422165, "adv/mean_abs_step_conf": 0.01924740895628929, "adv/ratio_final_to_reasoning": 0.4897627894295387, "adv/ratio_step_to_reasoning": 0.4979473246407493, "adv/std_final_conf": 0.16222229599952698, "adv/std_reasoning": 0.23421232402324677, "adv/std_step_conf": 0.1649332195520401, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.70578125, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.03437499999999982, "calib/mean_conf": 0.95578125, "calib/mu_c": 0.9300000000000002, "calib/mu_w": 0.964375, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.70578125, "calib/std_conf": 0.01732825884754432, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 649.31640625, "completions/mean_terminated_length": 689.7303466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.11093333333333333, "grad_norm": 0.054028674960136414, "learning_rate": 2.666666666666667e-06, "loss": -0.1354, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0006737759103998542, "mask/share_reasoning": 0.9399489164352417, "mask/share_step_conf": 0.0007835605647414923, "num_tokens": 27768874.0, "reward": 0.006154080852866173, "reward_std": 0.017406370490789413, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00020713958656415343, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.00048584199976176023, "step": 104 }, { "adv/mean_abs_final_conf": 0.03664717078208923, "adv/mean_abs_reasoning": 0.03864803910255432, "adv/mean_abs_step_conf": 0.03860872983932495, "adv/ratio_final_to_reasoning": 0.948228464705397, "adv/ratio_step_to_reasoning": 0.9989828911338797, "adv/std_final_conf": 0.22227756679058075, "adv/std_reasoning": 0.23417921364307404, "adv/std_step_conf": 0.23394106328487396, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.03125, "calib/ece": 0.9681625662625422, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9681625662625422, "calib/mu_c": NaN, "calib/mu_w": 0.9681625662625422, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.9681625662625422, "calib/std_conf": 0.026415746987534794, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 580.0, "completions/mean_terminated_length": 611.02880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.112, "grad_norm": 0.11139997094869614, "learning_rate": 2.6388888888888893e-06, "loss": -0.2799, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.000729624240193516, "mask/share_reasoning": 0.9466995000839233, "mask/share_step_conf": 0.0017896032659336925, "num_tokens": 28023114.0, "reward": 0.005710248835384846, "reward_std": 0.016151022166013718, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00048408948350697756, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.00336595275439322, "step": 105 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.9900000000000001, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9900000000000001, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 649.3984375, "completions/mean_terminated_length": 678.5550537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.11306666666666666, "grad_norm": 0.0, "learning_rate": 2.6111111111111113e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0001707650226308033, "mask/share_reasoning": 0.9564396142959595, "mask/share_step_conf": 0.0004209116450510919, "num_tokens": 28293944.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 106 }, { "adv/mean_abs_final_conf": 0.018899209797382355, "adv/mean_abs_reasoning": 0.01932401955127716, "adv/mean_abs_step_conf": 0.019282689318060875, "adv/ratio_final_to_reasoning": 0.978016491198037, "adv/ratio_step_to_reasoning": 0.9978611989546681, "adv/std_final_conf": 0.16194945573806763, "adv/std_reasoning": 0.16558970510959625, "adv/std_step_conf": 0.16523553431034088, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.9751562500000001, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9751562500000001, "calib/mu_c": NaN, "calib/mu_w": 0.9751562500000001, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.9751562500000001, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 603.23828125, "completions/mean_terminated_length": 638.1363525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.11413333333333334, "grad_norm": 0.02593773417174816, "learning_rate": 2.5833333333333337e-06, "loss": -0.1541, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00016640975081827492, "mask/share_reasoning": 0.944463849067688, "mask/share_step_conf": 0.0006822873838245869, "num_tokens": 28552989.0, "reward": 0.0024086986668407917, "reward_std": 0.006812828592956066, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00019168081053067, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.0008153068483807147, "step": 107 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.612890625, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.612890625, "calib/mu_c": NaN, "calib/mu_w": 0.612890625, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.612890625, "calib/std_conf": 0.31289062500000003, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 627.7890625, "completions/mean_terminated_length": 658.6638793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1152, "grad_norm": 0.0, "learning_rate": 2.5555555555555557e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.001333074877038598, "mask/share_reasoning": 0.951507568359375, "mask/share_step_conf": 0.000284366135019809, "num_tokens": 28816935.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 108 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.02734375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 550.63671875, "completions/mean_terminated_length": 582.49169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.11626666666666667, "grad_norm": 0.0007484604720957577, "learning_rate": 2.5277777777777778e-06, "loss": -0.0232, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9448825120925903, "mask/share_step_conf": 0.0004299637221265584, "num_tokens": 29062498.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 109 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.9771875, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9771875, "calib/mu_c": NaN, "calib/mu_w": 0.9771875, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.9771875, "calib/std_conf": 0.007187500000000013, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 583.64453125, "completions/mean_terminated_length": 612.3483276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.11733333333333333, "grad_norm": 0.0, "learning_rate": 2.5e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0005077360547147691, "mask/share_reasoning": 0.9521244764328003, "mask/share_step_conf": 0.0004928014823235571, "num_tokens": 29316831.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 110 }, { "adv/mean_abs_final_conf": 0.038073450326919556, "adv/mean_abs_reasoning": 0.03864803910255432, "adv/mean_abs_step_conf": 0.038613706827163696, "adv/ratio_final_to_reasoning": 0.9851327832154675, "adv/ratio_step_to_reasoning": 0.9991116683747001, "adv/std_final_conf": 0.23070098459720612, "adv/std_reasoning": 0.23417921364307404, "adv/std_step_conf": 0.23397119343280792, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.0390625, "calib/ece": 0.9636030726162881, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9636030726162881, "calib/mu_c": NaN, "calib/mu_w": 0.9636030726162881, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.9636030726162881, "calib/std_conf": 0.01469339308189049, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 629.72265625, "completions/mean_terminated_length": 655.3211059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1184, "grad_norm": 0.03367357701063156, "learning_rate": 2.4722222222222226e-06, "loss": -0.2922, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0008814925095066428, "mask/share_reasoning": 0.9582992792129517, "mask/share_step_conf": 0.0017567335162311792, "num_tokens": 29585448.0, "reward": 0.005740383639931679, "reward_std": 0.016236256808042526, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0006421275902539492, "rewards/format_reward_step": 0.0078125, "rewards/stepwise_brier_reward": 0.003347203601151705, "step": 111 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 691.02734375, "completions/mean_terminated_length": 731.0040893554688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.11946666666666667, "grad_norm": 0.0, "learning_rate": 2.4444444444444447e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.943239688873291, "mask/share_step_conf": 0.0020727741066366434, "num_tokens": 29870271.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 112 }, { "adv/mean_abs_final_conf": 0.018980545923113823, "adv/mean_abs_reasoning": 0.01932401955127716, "adv/mean_abs_step_conf": 0.01931268535554409, "adv/ratio_final_to_reasoning": 0.9822255598918271, "adv/ratio_step_to_reasoning": 0.9994134659353353, "adv/std_final_conf": 0.16264642775058746, "adv/std_reasoning": 0.16558970510959625, "adv/std_step_conf": 0.16549257934093475, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.9697435897435898, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9697435897435898, "calib/mu_c": NaN, "calib/mu_w": 0.9697435897435898, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.9697435897435898, "calib/std_conf": 0.0003626188621468834, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 615.72265625, "completions/mean_terminated_length": 643.3673095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.12053333333333334, "grad_norm": 0.02239268273115158, "learning_rate": 2.4166666666666667e-06, "loss": -0.1121, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0005311921122483909, "mask/share_reasoning": 0.9556958675384521, "mask/share_step_conf": 0.0008042484987527132, "num_tokens": 30133096.0, "reward": 0.00297063821926713, "reward_std": 0.008402233012020588, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00023668639187235385, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.0019166830461472273, "step": 113 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.02734375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 632.4140625, "completions/mean_terminated_length": 663.516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1216, "grad_norm": 0.0, "learning_rate": 2.388888888888889e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9525699615478516, "mask/share_step_conf": 0.0005550722125917673, "num_tokens": 30400018.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 114 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.05078125, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.9500000000000001, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9500000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.9500000000000001, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9500000000000001, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 637.1953125, "completions/mean_terminated_length": 691.1949462890625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.12266666666666666, "grad_norm": 0.0, "learning_rate": 2.361111111111111e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.001953125, "mask/share_reasoning": 0.9198802709579468, "mask/share_step_conf": 4.1567218431737274e-05, "num_tokens": 30668404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 115 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0390625, "calib/avg_num_step_conf": 0.02734375, "calib/ece": 0.9651347030478711, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9651347030478711, "calib/mu_c": NaN, "calib/mu_w": 0.9651347030478711, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.9651347030478711, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 646.5546875, "completions/mean_terminated_length": 681.1439819335938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.12373333333333333, "grad_norm": 0.0, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00024069320352282375, "mask/share_reasoning": 0.9482711553573608, "mask/share_step_conf": 0.0007068718550726771, "num_tokens": 30938442.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 116 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.03125, "calib/ece": 0.9606285696295394, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9606285696295394, "calib/mu_c": NaN, "calib/mu_w": 0.9606285696295394, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.9606285696295394, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 643.56640625, "completions/mean_terminated_length": 686.4708862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1248, "grad_norm": 0.0, "learning_rate": 2.305555555555556e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00019975141913164407, "mask/share_reasoning": 0.9363988041877747, "mask/share_step_conf": 0.0009014662355184555, "num_tokens": 31209795.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 117 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2917.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 579.546875, "completions/mean_terminated_length": 620.7698364257812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.12586666666666665, "grad_norm": 0.0005588608328253031, "learning_rate": 2.277777777777778e-06, "loss": -0.0076, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9332329034805298, "mask/share_step_conf": 0.00036084410385228693, "num_tokens": 31462167.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 118 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 628.5234375, "completions/mean_terminated_length": 659.4343872070312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.12693333333333334, "grad_norm": 0.0, "learning_rate": 2.25e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0003443797177169472, "mask/share_reasoning": 0.9520903825759888, "mask/share_step_conf": 0.0006902526365593076, "num_tokens": 31728133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 119 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 563.96484375, "completions/mean_terminated_length": 604.0794677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.128, "grad_norm": 0.0, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9334514737129211, "mask/share_step_conf": 0.0001422833011019975, "num_tokens": 31979196.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 120 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 626.875, "completions/mean_terminated_length": 660.4114990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.12906666666666666, "grad_norm": 0.0, "learning_rate": 2.1944444444444445e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9491629600524902, "mask/share_step_conf": 5.580885408562608e-05, "num_tokens": 32244732.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 121 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.9659777386006246, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9659777386006246, "calib/mu_c": NaN, "calib/mu_w": 0.9659777386006246, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.9659777386006246, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 575.4375, "completions/mean_terminated_length": 629.5385131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.13013333333333332, "grad_norm": 0.0, "learning_rate": 2.166666666666667e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0003627232217695564, "mask/share_reasoning": 0.9131672382354736, "mask/share_step_conf": 0.0005325586535036564, "num_tokens": 32499388.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 122 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 637.34765625, "completions/mean_terminated_length": 668.6925659179688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1312, "grad_norm": 0.0, "learning_rate": 2.138888888888889e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9520647525787354, "mask/share_step_conf": 0.00106026791036129, "num_tokens": 32767837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 123 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 561.5703125, "completions/mean_terminated_length": 601.5146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.13226666666666667, "grad_norm": 0.0, "learning_rate": 2.1111111111111114e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9335083961486816, "mask/share_step_conf": 8.535483357263729e-05, "num_tokens": 33018415.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 124 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 659.74609375, "completions/mean_terminated_length": 703.7291870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.13333333333333333, "grad_norm": 0.0, "learning_rate": 2.0833333333333334e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9373856782913208, "mask/share_step_conf": 0.00011434726184234023, "num_tokens": 33292118.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 125 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2852.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 635.34375, "completions/mean_terminated_length": 674.8880004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.1344, "grad_norm": 0.0, "learning_rate": 2.0555555555555555e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9414001703262329, "mask/share_step_conf": 6.056201527826488e-06, "num_tokens": 33560230.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 126 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 663.05078125, "completions/mean_terminated_length": 719.2415161132812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.13546666666666668, "grad_norm": 0.0006136018200777471, "learning_rate": 2.027777777777778e-06, "loss": -0.0093, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9217934012413025, "mask/share_step_conf": 8.160474681062624e-05, "num_tokens": 33833643.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 127 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 627.9453125, "completions/mean_terminated_length": 661.5390625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.13653333333333334, "grad_norm": 0.0, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.94921875, "mask/share_step_conf": 0.0, "num_tokens": 34101061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 128 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.02734375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 692.4921875, "completions/mean_terminated_length": 723.5836181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1376, "grad_norm": 0.0, "learning_rate": 1.9722222222222224e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9545925855636597, "mask/share_step_conf": 0.0024386178702116013, "num_tokens": 34380723.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 129 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 536.00390625, "completions/mean_terminated_length": 581.427978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.13866666666666666, "grad_norm": 0.0, "learning_rate": 1.944444444444445e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9218621253967285, "mask/share_step_conf": 1.2865376447734889e-05, "num_tokens": 34623228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 130 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 613.83984375, "completions/mean_terminated_length": 652.045654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.13973333333333332, "grad_norm": 0.0, "learning_rate": 1.916666666666667e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9411472082138062, "mask/share_step_conf": 0.0002590351505205035, "num_tokens": 34886579.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 131 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 660.9765625, "completions/mean_terminated_length": 705.0416870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1408, "grad_norm": 0.0, "learning_rate": 1.888888888888889e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9372943639755249, "mask/share_step_conf": 0.00020565465092658997, "num_tokens": 35161381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 132 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.00390625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 708.54296875, "completions/mean_terminated_length": 740.3550415039062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.14186666666666667, "grad_norm": 0.0, "learning_rate": 1.8611111111111113e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.956986129283905, "mask/share_step_conf": 4.512997475103475e-05, "num_tokens": 35449112.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 133 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0078125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 702.57421875, "completions/mean_terminated_length": 762.1143798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.14293333333333333, "grad_norm": 0.0, "learning_rate": 1.8333333333333333e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9218196272850037, "mask/share_step_conf": 5.537808465305716e-05, "num_tokens": 35737923.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 134 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 672.32421875, "completions/mean_terminated_length": 708.2921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.144, "grad_norm": 0.0005483909044414759, "learning_rate": 1.8055555555555557e-06, "loss": -0.0099, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9491338729858398, "mask/share_step_conf": 8.486342267133296e-05, "num_tokens": 36015918.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 135 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.975, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.975, "calib/mu_c": NaN, "calib/mu_w": 0.975, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.975, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 612.83203125, "completions/mean_terminated_length": 653.6875610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.14506666666666668, "grad_norm": 0.0, "learning_rate": 1.777777777777778e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 5.321866410668008e-05, "mask/share_reasoning": 0.9371010661125183, "mask/share_step_conf": 0.0003457071434240788, "num_tokens": 36281291.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 136 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0078125, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 668.94140625, "completions/mean_terminated_length": 701.8401489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.14613333333333334, "grad_norm": 0.0, "learning_rate": 1.75e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9511955976486206, "mask/share_step_conf": 0.001929364399984479, "num_tokens": 36559524.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 137 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 593.125, "completions/mean_terminated_length": 624.85595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1472, "grad_norm": 0.0, "learning_rate": 1.7222222222222224e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.948685884475708, "mask/share_step_conf": 0.000532844103872776, "num_tokens": 36815700.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 138 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 599.9375, "completions/mean_terminated_length": 639.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.14826666666666666, "grad_norm": 0.0, "learning_rate": 1.6944444444444446e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9371291399002075, "mask/share_step_conf": 0.0003708239528350532, "num_tokens": 37072380.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 139 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 653.01953125, "completions/mean_terminated_length": 685.1351928710938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.14933333333333335, "grad_norm": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9528490900993347, "mask/share_step_conf": 0.0002759067574515939, "num_tokens": 37344569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 140 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 711.59765625, "completions/mean_terminated_length": 755.8880004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1504, "grad_norm": 0.0, "learning_rate": 1.638888888888889e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9413928985595703, "mask/share_step_conf": 1.3360410775931086e-05, "num_tokens": 37633834.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 141 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 685.59765625, "completions/mean_terminated_length": 731.30419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.15146666666666667, "grad_norm": 0.0, "learning_rate": 1.6111111111111113e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9340871572494507, "mask/share_step_conf": 0.003412847174331546, "num_tokens": 37914507.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 142 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 656.6953125, "completions/mean_terminated_length": 691.8271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.15253333333333333, "grad_norm": 0.0, "learning_rate": 1.5833333333333333e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9482780694961548, "mask/share_step_conf": 0.0009406713652424514, "num_tokens": 38189957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 143 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.9940721006120699, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9940721006120699, "calib/mu_c": NaN, "calib/mu_w": 0.9940721006120699, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9940721006120699, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 656.30859375, "completions/mean_terminated_length": 700.0625610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1536, "grad_norm": 0.001189550501294434, "learning_rate": 1.5555555555555558e-06, "loss": -0.0069, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0003620766510721296, "mask/share_reasoning": 0.93581223487854, "mask/share_step_conf": 0.0013256651582196355, "num_tokens": 38462100.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 144 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.02734375, "calib/ece": 0.96, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.96, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 635.6953125, "completions/mean_terminated_length": 669.7036743164062, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.15466666666666667, "grad_norm": 0.0, "learning_rate": 1.527777777777778e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 8.281802001874894e-05, "mask/share_reasoning": 0.9487344026565552, "mask/share_step_conf": 0.00040147791150957346, "num_tokens": 38727542.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 145 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 636.84375, "completions/mean_terminated_length": 679.300048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.15573333333333333, "grad_norm": 0.0, "learning_rate": 1.5e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9373124837875366, "mask/share_step_conf": 0.0001875000016298145, "num_tokens": 38997790.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 146 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.0, "calib/ece": 0.9807894736842104, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9807894736842104, "calib/mu_c": NaN, "calib/mu_w": 0.9807894736842104, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.0078125, "calib/nonempty_step_conf_rate": 0.0, "calib/pce": 0.9807894736842104, "calib/std_conf": 0.019210526315789234, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 636.046875, "completions/mean_terminated_length": 659.2227172851562, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1568, "grad_norm": 0.0, "learning_rate": 1.4722222222222225e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00044946392881684005, "mask/share_reasoning": 0.9642276763916016, "mask/share_step_conf": 0.00016664052964188159, "num_tokens": 39264298.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 147 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 586.65234375, "completions/mean_terminated_length": 620.5908813476562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.15786666666666666, "grad_norm": 0.0, "learning_rate": 1.4444444444444445e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9447327852249146, "mask/share_step_conf": 0.0005797140765935183, "num_tokens": 39519593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 148 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.03865896910429001, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.2342454344034195, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 691.40234375, "completions/mean_terminated_length": 737.495849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.15893333333333334, "grad_norm": 0.0006965881329961121, "learning_rate": 1.4166666666666667e-06, "loss": -0.0201, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9373958110809326, "mask/share_step_conf": 0.00010416324221296236, "num_tokens": 39801048.0, "reward": 0.0078125, "reward_std": 0.022097086533904076, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 149 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 619.40234375, "completions/mean_terminated_length": 655.2355346679688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 1.3888888888888892e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9431923031806946, "mask/share_step_conf": 0.002120216842740774, "num_tokens": 40064575.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 150 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.94, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.94, "calib/mu_c": NaN, "calib/mu_w": 0.94, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.94, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 592.2265625, "completions/mean_terminated_length": 639.70458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16106666666666666, "grad_norm": 0.0, "learning_rate": 1.3611111111111112e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 4.515895852819085e-05, "mask/share_reasoning": 0.9251866340637207, "mask/share_step_conf": 0.0005494383512996137, "num_tokens": 40323209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 151 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.9842106360885977, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9842106360885977, "calib/mu_c": NaN, "calib/mu_w": 0.9842106360885977, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.9842106360885977, "calib/std_conf": 0.011988413866375502, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 578.18359375, "completions/mean_terminated_length": 619.3096313476562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16213333333333332, "grad_norm": 0.0, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0008371968287974596, "mask/share_reasoning": 0.9285774827003479, "mask/share_step_conf": 0.004179063253104687, "num_tokens": 40576616.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 152 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 654.390625, "completions/mean_terminated_length": 678.23486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1632, "grad_norm": 0.0, "learning_rate": 1.3055555555555556e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9647525548934937, "mask/share_step_conf": 9.118674643104896e-05, "num_tokens": 40851460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 153 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.015625, "calib/ece": 0.8750000000000002, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.8750000000000002, "calib/mu_c": NaN, "calib/mu_w": 0.8750000000000002, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.015625, "calib/pce": 0.8750000000000002, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 655.62109375, "completions/mean_terminated_length": 674.0521850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16426666666666667, "grad_norm": 0.0, "learning_rate": 1.2777777777777779e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.000894201803021133, "mask/share_reasoning": 0.9707907438278198, "mask/share_step_conf": 0.0009712825412862003, "num_tokens": 41123739.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 154 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0078125, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 560.046875, "completions/mean_terminated_length": 587.5901489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16533333333333333, "grad_norm": 0.0, "learning_rate": 1.25e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.953125, "mask/share_step_conf": 0.0, "num_tokens": 41374327.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 155 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 657.609375, "completions/mean_terminated_length": 704.3849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1664, "grad_norm": 0.0, "learning_rate": 1.2222222222222223e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.931401789188385, "mask/share_step_conf": 0.00219197035767138, "num_tokens": 41647435.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 156 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.015625, "calib/ece": 0.9226582608695653, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9226582608695653, "calib/mu_c": NaN, "calib/mu_w": 0.9226582608695653, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.9226582608695653, "calib/std_conf": 0.002658260869565221, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 774.03515625, "completions/mean_terminated_length": 839.63134765625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.16746666666666668, "grad_norm": 0.0, "learning_rate": 1.1944444444444446e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0017013853648677468, "mask/share_reasoning": 0.918377697467804, "mask/share_step_conf": 0.0017959036631509662, "num_tokens": 41949316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 157 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.03515625, "calib/ece": 0.960828827622024, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.960828827622024, "calib/mu_c": NaN, "calib/mu_w": 0.960828827622024, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.960828827622024, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.03515625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 692.8515625, "completions/mean_terminated_length": 751.5678100585938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16853333333333334, "grad_norm": 0.0, "learning_rate": 1.1666666666666668e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00036790166632272303, "mask/share_reasoning": 0.9196397066116333, "mask/share_step_conf": 0.00186736264731735, "num_tokens": 42231926.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 158 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.9999999999999998, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9999999999999998, "calib/mu_c": NaN, "calib/mu_w": 0.9999999999999998, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9999999999999998, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 576.9140625, "completions/mean_terminated_length": 615.3750610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1696, "grad_norm": 0.0, "learning_rate": 1.138888888888889e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0006256421329453588, "mask/share_reasoning": 0.9368565082550049, "mask/share_step_conf": 1.7836757251643576e-05, "num_tokens": 42484400.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 159 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.02734375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 656.1875, "completions/mean_terminated_length": 688.458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.17066666666666666, "grad_norm": 0.0, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9526559114456177, "mask/share_step_conf": 0.0004691215290222317, "num_tokens": 42757224.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 160 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0390625, "calib/avg_num_step_conf": 0.02734375, "calib/ece": 0.8, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.8, "calib/mu_c": NaN, "calib/mu_w": 0.8, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.8, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 688.9296875, "completions/mean_terminated_length": 728.7850952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.17173333333333332, "grad_norm": 0.00032964005367830396, "learning_rate": 1.0833333333333335e-06, "loss": -0.0086, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00021701389050576836, "mask/share_reasoning": 0.9441901445388794, "mask/share_step_conf": 0.000905350549146533, "num_tokens": 43037510.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 161 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 574.69921875, "completions/mean_terminated_length": 605.4443969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1728, "grad_norm": 0.0, "learning_rate": 1.0555555555555557e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9491317272186279, "mask/share_step_conf": 8.699888712726533e-05, "num_tokens": 43288777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 162 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 560.73828125, "completions/mean_terminated_length": 610.8468017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.17386666666666667, "grad_norm": 0.0, "learning_rate": 1.0277777777777777e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.917188823223114, "mask/share_step_conf": 0.000779931026045233, "num_tokens": 43537158.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 163 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 613.52734375, "completions/mean_terminated_length": 649.0206298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.17493333333333333, "grad_norm": 0.0, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9443371295928955, "mask/share_step_conf": 0.0009753695921972394, "num_tokens": 43800357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 164 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.96, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.96, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 606.296875, "completions/mean_terminated_length": 654.9028930664062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.176, "grad_norm": 0.0, "learning_rate": 9.722222222222224e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 2.861721623048652e-05, "mask/share_reasoning": 0.9257179498672485, "mask/share_step_conf": 3.46535089192912e-05, "num_tokens": 44061145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 165 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.02734375, "calib/ece": 0.9682673675233835, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9682673675233835, "calib/mu_c": NaN, "calib/mu_w": 0.9682673675233835, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.9682673675233835, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 637.0234375, "completions/mean_terminated_length": 679.49169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.17706666666666668, "grad_norm": 0.0005020391545258462, "learning_rate": 9.444444444444445e-07, "loss": 0.0082, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0030303029343485832, "mask/share_reasoning": 0.9339636564254761, "mask/share_step_conf": 0.0005060465191490948, "num_tokens": 44330407.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 166 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 612.921875, "completions/mean_terminated_length": 667.693603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.17813333333333334, "grad_norm": 0.0, "learning_rate": 9.166666666666666e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.917394757270813, "mask/share_step_conf": 0.0005739557673223317, "num_tokens": 44592923.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 167 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 622.71875, "completions/mean_terminated_length": 656.0328979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1792, "grad_norm": 0.0005375173641368747, "learning_rate": 8.88888888888889e-07, "loss": -0.0123, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9491442441940308, "mask/share_step_conf": 7.449730765074492e-05, "num_tokens": 44857011.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 168 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 688.1953125, "completions/mean_terminated_length": 719.0938720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18026666666666666, "grad_norm": 0.0, "learning_rate": 8.611111111111112e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9561343193054199, "mask/share_step_conf": 0.0008969124755822122, "num_tokens": 45137373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 169 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 634.4140625, "completions/mean_terminated_length": 673.9004516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18133333333333335, "grad_norm": 0.0, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.941338062286377, "mask/share_step_conf": 6.818037945777178e-05, "num_tokens": 45403935.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 170 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 555.0390625, "completions/mean_terminated_length": 592.0416870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1824, "grad_norm": 0.0, "learning_rate": 8.055555555555557e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9375, "mask/share_step_conf": 0.0, "num_tokens": 45652921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 171 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 579.78125, "completions/mean_terminated_length": 613.322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18346666666666667, "grad_norm": 0.0, "learning_rate": 7.777777777777779e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9434047937393188, "mask/share_step_conf": 0.0019076891476288438, "num_tokens": 45904697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 172 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 657.28125, "completions/mean_terminated_length": 692.4443969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.18453333333333333, "grad_norm": 0.0, "learning_rate": 7.5e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.948871374130249, "mask/share_step_conf": 0.0003474223776720464, "num_tokens": 46176121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 173 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01171875, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 678.28125, "completions/mean_terminated_length": 723.5000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1856, "grad_norm": 0.0, "learning_rate": 7.222222222222222e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9375, "mask/share_step_conf": 0.0, "num_tokens": 46453993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 174 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.02734375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 576.91796875, "completions/mean_terminated_length": 615.3792114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18666666666666668, "grad_norm": 0.0007179116364568472, "learning_rate": 6.944444444444446e-07, "loss": -0.0247, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9356477856636047, "mask/share_step_conf": 0.0018521937308833003, "num_tokens": 46707508.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 175 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0234375, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 661.20703125, "completions/mean_terminated_length": 702.3610229492188, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.18773333333333334, "grad_norm": 0.0, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9411953687667847, "mask/share_step_conf": 0.00021085733897052705, "num_tokens": 46980841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 176 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 613.859375, "completions/mean_terminated_length": 644.0491333007812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1888, "grad_norm": 0.0, "learning_rate": 6.388888888888889e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0004226059827487916, "mask/share_reasoning": 0.952595591545105, "mask/share_step_conf": 0.00010681642743293196, "num_tokens": 47241821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 177 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 615.19140625, "completions/mean_terminated_length": 650.7809448242188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18986666666666666, "grad_norm": 0.001339837210252881, "learning_rate": 6.111111111111112e-07, "loss": 0.0007, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.944941520690918, "mask/share_step_conf": 0.000371001660823822, "num_tokens": 47505382.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 178 }, { "adv/mean_abs_final_conf": 0.018928933888673782, "adv/mean_abs_reasoning": 0.01932401955127716, "adv/mean_abs_step_conf": 0.01928865909576416, "adv/ratio_final_to_reasoning": 0.9795546852167584, "adv/ratio_step_to_reasoning": 0.9981701293864266, "adv/std_final_conf": 0.16220416128635406, "adv/std_reasoning": 0.16558970510959625, "adv/std_step_conf": 0.16528668999671936, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.9391350031946583, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9391350031946583, "calib/mu_c": NaN, "calib/mu_w": 0.9391350031946583, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.9391350031946583, "calib/std_conf": 0.03413500319465823, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 636.2421875, "completions/mean_terminated_length": 667.5327758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.19093333333333334, "grad_norm": 0.06453783065080643, "learning_rate": 5.833333333333334e-07, "loss": -0.1462, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.001012221910059452, "mask/share_reasoning": 0.9516113996505737, "mask/share_step_conf": 0.0005013375193811953, "num_tokens": 47774524.0, "reward": 0.0024650082923471928, "reward_std": 0.006972096394747496, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00020603708981070668, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.000920747930649668, "step": 179 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 647.69921875, "completions/mean_terminated_length": 688.0125122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.192, "grad_norm": 0.0, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.938878059387207, "mask/share_step_conf": 0.002528225537389517, "num_tokens": 48044191.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 180 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 603.8125, "completions/mean_terminated_length": 663.4163208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.19306666666666666, "grad_norm": 0.0012316078646108508, "learning_rate": 5.277777777777779e-07, "loss": -0.0265, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9101225137710571, "mask/share_step_conf": 3.375771484570578e-05, "num_tokens": 48305031.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 181 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 574.73828125, "completions/mean_terminated_length": 607.987548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.19413333333333332, "grad_norm": 0.0, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9448782205581665, "mask/share_step_conf": 0.0004343033069744706, "num_tokens": 48558324.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 182 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.015625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 578.08984375, "completions/mean_terminated_length": 611.5330200195312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1952, "grad_norm": 0.0, "learning_rate": 4.7222222222222226e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9451446533203125, "mask/share_step_conf": 0.0001678102562436834, "num_tokens": 48812995.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 183 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 593.359375, "completions/mean_terminated_length": 638.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.19626666666666667, "grad_norm": 0.0, "learning_rate": 4.444444444444445e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9294921159744263, "mask/share_step_conf": 0.0001953368482645601, "num_tokens": 49070175.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 184 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 627.8359375, "completions/mean_terminated_length": 666.9129028320312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.19733333333333333, "grad_norm": 0.0, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9410565495491028, "mask/share_step_conf": 0.000349695939803496, "num_tokens": 49337821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 185 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 635.94140625, "completions/mean_terminated_length": 675.5228271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1984, "grad_norm": 0.0, "learning_rate": 3.8888888888888895e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9408471584320068, "mask/share_step_conf": 0.0005590926157310605, "num_tokens": 49605662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 186 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0078125, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 722.9453125, "completions/mean_terminated_length": 774.3681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.19946666666666665, "grad_norm": 0.000874101126100868, "learning_rate": 3.611111111111111e-07, "loss": -0.0042, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9335595369338989, "mask/share_step_conf": 3.423374073463492e-05, "num_tokens": 49892280.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 187 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.0390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0625, "calib/nonempty_step_conf_rate": 0.02734375, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 698.91796875, "completions/mean_terminated_length": 739.3511962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 6.0, "epoch": 0.20053333333333334, "grad_norm": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9445304870605469, "mask/share_step_conf": 0.0007820092723704875, "num_tokens": 50175275.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 188 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.015625, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 602.7265625, "completions/mean_terminated_length": 622.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.2016, "grad_norm": 0.0, "learning_rate": 3.055555555555556e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9686535596847534, "mask/share_step_conf": 9.648424020269886e-05, "num_tokens": 50437341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 189 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.9928518292682926, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9928518292682926, "calib/mu_c": NaN, "calib/mu_w": 0.9928518292682926, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9928518292682926, "calib/std_conf": 0.007148170731707315, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 573.69140625, "completions/mean_terminated_length": 614.4978637695312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.20266666666666666, "grad_norm": 0.0, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00041248349589295685, "mask/share_reasoning": 0.9331349730491638, "mask/share_step_conf": 4.62801763205789e-05, "num_tokens": 50689814.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 190 }, { "adv/mean_abs_final_conf": 0.01899779960513115, "adv/mean_abs_reasoning": 0.038653504103422165, "adv/mean_abs_step_conf": 0.019316211342811584, "adv/ratio_final_to_reasoning": 0.49148971214356707, "adv/ratio_step_to_reasoning": 0.49972730263027915, "adv/std_final_conf": 0.162794291973114, "adv/std_reasoning": 0.23421232402324677, "adv/std_step_conf": 0.16552278399467468, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.9688, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9688, "calib/mu_c": NaN, "calib/mu_w": 0.9688, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.9688, "calib/std_conf": 0.0011999999999999789, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 630.11328125, "completions/mean_terminated_length": 661.1024169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.20373333333333332, "grad_norm": 0.0887182429432869, "learning_rate": 2.5000000000000004e-07, "loss": -0.1527, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0001581465476192534, "mask/share_reasoning": 0.9523420333862305, "mask/share_step_conf": 0.0006248306017369032, "num_tokens": 50955291.0, "reward": 0.007060363423079252, "reward_std": 0.019969724118709564, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00024902436416596174, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.0022774646058678627, "step": 191 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.00390625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0078125, "calib/nonempty_step_conf_rate": 0.00390625, "calib/step_conf_rate": 0.00390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 590.32421875, "completions/mean_terminated_length": 627.06640625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.2048, "grad_norm": 0.0, "learning_rate": 2.2222222222222224e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9413959980010986, "mask/share_step_conf": 1.0268796359014232e-05, "num_tokens": 51211390.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 192 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.02734375, "calib/ece": 0.9600000000000001, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9600000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.9600000000000001, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.9600000000000001, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 637.46484375, "completions/mean_terminated_length": 663.3780517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.20586666666666667, "grad_norm": 0.0, "learning_rate": 1.9444444444444447e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00012885199976153672, "mask/share_reasoning": 0.960355281829834, "mask/share_step_conf": 0.0004534159670583904, "num_tokens": 51480293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 193 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.96, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.96, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 656.734375, "completions/mean_terminated_length": 709.3839111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.20693333333333333, "grad_norm": 0.0008238269947469234, "learning_rate": 1.6666666666666668e-07, "loss": -0.0312, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00026483050896786153, "mask/share_reasoning": 0.9254249930381775, "mask/share_step_conf": 9.144169598584995e-05, "num_tokens": 51754361.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 194 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019329484552145004, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16563653945922852, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.0078125, "calib/ece": 0.9601074217378754, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9601074217378754, "calib/mu_c": NaN, "calib/mu_w": 0.9601074217378754, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0078125, "calib/pce": 0.9601074217378754, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.0078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 696.2890625, "completions/mean_terminated_length": 742.7083740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.208, "grad_norm": 0.000848722702357918, "learning_rate": 1.3888888888888888e-07, "loss": 0.0067, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0006241945666261017, "mask/share_reasoning": 0.9368205070495605, "mask/share_step_conf": 5.5262891692109406e-05, "num_tokens": 52038595.0, "reward": 0.00390625, "reward_std": 0.011048543266952038, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 195 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.03125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0234375, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 692.03125, "completions/mean_terminated_length": 726.0655517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.20906666666666668, "grad_norm": 0.0, "learning_rate": 1.1111111111111112e-07, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9523943662643433, "mask/share_step_conf": 0.0007306202314794064, "num_tokens": 52318299.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 196 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.0234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01953125, "calib/step_conf_rate": 0.01953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 619.29296875, "completions/mean_terminated_length": 666.1303100585938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.21013333333333334, "grad_norm": 0.0, "learning_rate": 8.333333333333334e-08, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9287298321723938, "mask/share_step_conf": 0.0009576534503139555, "num_tokens": 52581894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 197 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.01171875, "calib/ece": 0.97, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.97, "calib/mu_c": NaN, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.02734375, "calib/nonempty_step_conf_rate": 0.01171875, "calib/pce": 0.97, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 646.1484375, "completions/mean_terminated_length": 683.5288696289062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.2112, "grad_norm": 0.0, "learning_rate": 5.555555555555556e-08, "loss": 0.0, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00013048943947069347, "mask/share_reasoning": 0.9449419379234314, "mask/share_step_conf": 0.00024005374871194363, "num_tokens": 52852692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 198 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.01953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 635.00390625, "completions/mean_terminated_length": 683.0294189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.21226666666666666, "grad_norm": 0.0, "learning_rate": 2.777777777777778e-08, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9284061789512634, "mask/share_step_conf": 0.0012813331559300423, "num_tokens": 53119453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 199 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01171875, "calib/step_conf_rate": 0.01171875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 669.09375, "completions/mean_terminated_length": 713.7000122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.21333333333333335, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.0, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9339677095413208, "mask/share_step_conf": 0.003532242262735963, "num_tokens": 53398789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/stepwise_brier_reward": 0.0, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.40800720210187136, "train_runtime": 11790.426, "train_samples_per_second": 4.343, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 53398789, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }