Files
PureRL-1.5B-v9E-digit-w050/trainer_state.json
ModelHub XC 6676b337b7 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v9E-digit-w050
Source: Original Platform
2026-06-04 17:43:19 +08:00

10202 lines
402 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.13436700403690338,
"adv/mean_abs_reasoning": 0.15610459446907043,
"adv/mean_abs_step_conf": 0.13763591647148132,
"adv/ratio_final_to_reasoning": 0.8607498356720436,
"adv/ratio_step_to_reasoning": 0.8816903624111565,
"adv/std_final_conf": 0.40337812900543213,
"adv/std_reasoning": 0.43819621205329895,
"adv/std_step_conf": 0.4055573642253876,
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.6944444444444445,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.6230769230769231,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.03861111111111115,
"calib/mean_conf": 0.9307692307692309,
"calib/mu_c": 0.9575,
"calib/mu_w": 0.9188888888888889,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.6230769230769231,
"calib/std_conf": 0.07965903671384378,
"calib/step_conf_rate": 0.0703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2955.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 613.67578125,
"completions/mean_terminated_length": 674.2532348632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.187005877494812,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.905,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0006368102040141821,
"mask/share_reasoning": 0.9053931832313538,
"mask/share_step_conf": 0.004126261919736862,
"num_tokens": 264685.0,
"reward": 0.05631820484995842,
"reward_std": 0.11161534488201141,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.01655624993145466,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.030139535665512085,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.24564749002456665,
"adv/mean_abs_reasoning": 0.308665931224823,
"adv/mean_abs_step_conf": 0.2689768671989441,
"adv/ratio_final_to_reasoning": 0.7958360971352048,
"adv/ratio_step_to_reasoning": 0.8714174127724819,
"adv/std_final_conf": 0.5673569440841675,
"adv/std_reasoning": 0.6196993589401245,
"adv/std_step_conf": 0.5733745098114014,
"calib/answer_extract_rate": 0.13671875,
"calib/auroc": 0.5338345864661654,
"calib/avg_num_step_conf": 0.55078125,
"calib/ece": 0.6261538461538463,
"calib/final_conf_rate": 0.1015625,
"calib/format_rate": 0.08984375,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.002406015037593856,
"calib/mean_conf": 0.8953846153846153,
"calib/mu_c": 0.897142857142857,
"calib/mu_w": 0.8947368421052632,
"calib/nonempty_final_conf_rate": 0.1015625,
"calib/nonempty_reasoning_rate": 0.14453125,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.6261538461538463,
"calib/std_conf": 0.18653172073466937,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 646.4609375,
"completions/mean_terminated_length": 683.8594970703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.18600612878799438,
"learning_rate": 5.000000000000001e-07,
"loss": -1.4124,
"mask/has_final_conf_rate": 0.1015625,
"mask/share_final_conf": 0.003081148024648428,
"mask/share_reasoning": 0.934806227684021,
"mask/share_step_conf": 0.007425096817314625,
"num_tokens": 533467.0,
"reward": 0.10996345430612564,
"reward_std": 0.20499171316623688,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.02965039201080799,
"rewards/format_reward_step": 0.08984375,
"rewards/stepwise_brier_reward": 0.05275794863700867,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.11627759039402008,
"adv/mean_abs_reasoning": 0.15320944786071777,
"adv/mean_abs_step_conf": 0.14957186579704285,
"adv/ratio_final_to_reasoning": 0.758945300160129,
"adv/ratio_step_to_reasoning": 0.9762574559567512,
"adv/std_final_conf": 0.397796094417572,
"adv/std_reasoning": 0.4381342828273773,
"adv/std_step_conf": 0.43789950013160706,
"calib/answer_extract_rate": 0.046875,
"calib/avg_num_step_conf": 0.26953125,
"calib/ece": 0.974,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.974,
"calib/mu_c": NaN,
"calib/mu_w": 0.974,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.974,
"calib/std_conf": 0.02537715508089904,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 3037.0,
"completions/max_terminated_length": 3037.0,
"completions/mean_length": 684.71875,
"completions/mean_terminated_length": 762.1217041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0032,
"grad_norm": 0.1434255689382553,
"learning_rate": 7.5e-07,
"loss": -0.7424,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.000625855871476233,
"mask/share_reasoning": 0.8943076133728027,
"mask/share_step_conf": 0.0035040113143622875,
"num_tokens": 814011.0,
"reward": 0.030487660318613052,
"reward_std": 0.06891795992851257,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0019796874839812517,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.020922981202602386,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.13574722409248352,
"adv/mean_abs_reasoning": 0.1447548270225525,
"adv/mean_abs_step_conf": 0.14417070150375366,
"adv/ratio_final_to_reasoning": 0.9377733847268139,
"adv/ratio_step_to_reasoning": 0.9959647251092509,
"adv/std_final_conf": 0.4322591722011566,
"adv/std_reasoning": 0.43817517161369324,
"adv/std_step_conf": 0.43792372941970825,
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.40476190476190477,
"calib/avg_num_step_conf": 0.2109375,
"calib/ece": 0.64,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": -0.02857142857142858,
"calib/mean_conf": 0.9399999999999998,
"calib/mu_c": 0.9199999999999999,
"calib/mu_w": 0.9485714285714285,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.64,
"calib/std_conf": 0.05848076606885379,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2934.0,
"completions/max_terminated_length": 2934.0,
"completions/mean_length": 707.953125,
"completions/mean_terminated_length": 774.5128784179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.18157421052455902,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.9007,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0004933603922836483,
"mask/share_reasoning": 0.9104673862457275,
"mask/share_step_conf": 0.003101823152974248,
"num_tokens": 1101415.0,
"reward": 0.04077763482928276,
"reward_std": 0.09365655481815338,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.010477343574166298,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.017722850665450096,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.19167664647102356,
"adv/mean_abs_reasoning": 0.2151871919631958,
"adv/mean_abs_step_conf": 0.19849944114685059,
"adv/ratio_final_to_reasoning": 0.8907437506959367,
"adv/ratio_step_to_reasoning": 0.9224500739839601,
"adv/std_final_conf": 0.519432544708252,
"adv/std_reasoning": 0.5492884516716003,
"adv/std_step_conf": 0.5234185457229614,
"calib/answer_extract_rate": 0.07421875,
"calib/auroc": 0.5104166666666666,
"calib/avg_num_step_conf": 0.35546875,
"calib/ece": 0.6356250000000001,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.6875,
"calib/gap": 0.07916666666666672,
"calib/mean_conf": 0.885625,
"calib/mu_c": 0.945,
"calib/mu_w": 0.8658333333333332,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.6356250000000001,
"calib/std_conf": 0.23192048071483465,
"calib/step_conf_rate": 0.0703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 2885.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 655.25390625,
"completions/mean_terminated_length": 732.5109252929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.1356576830148697,
"learning_rate": 1.25e-06,
"loss": -1.0562,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.0008796079782769084,
"mask/share_reasoning": 0.8888272643089294,
"mask/share_step_conf": 0.004824398085474968,
"num_tokens": 1375848.0,
"reward": 0.059928152710199356,
"reward_std": 0.16126090288162231,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.018272656947374344,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.02868872694671154,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.1842029094696045,
"adv/mean_abs_reasoning": 0.20916666090488434,
"adv/mean_abs_step_conf": 0.20033016800880432,
"adv/ratio_final_to_reasoning": 0.880651384272794,
"adv/ratio_step_to_reasoning": 0.9577538176597928,
"adv/std_final_conf": 0.49309083819389343,
"adv/std_reasoning": 0.49686720967292786,
"adv/std_step_conf": 0.4966629445552826,
"calib/answer_extract_rate": 0.10546875,
"calib/auroc": 0.4736842105263158,
"calib/avg_num_step_conf": 0.49609375,
"calib/ece": 0.7418363636363637,
"calib/final_conf_rate": 0.0859375,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.7272727272727273,
"calib/gap": 0.04454035087719299,
"calib/mean_conf": 0.8782,
"calib/mu_c": 0.9166666666666666,
"calib/mu_w": 0.8721263157894736,
"calib/nonempty_final_conf_rate": 0.0859375,
"calib/nonempty_reasoning_rate": 0.13671875,
"calib/nonempty_step_conf_rate": 0.09375,
"calib/pce": 0.7418363636363637,
"calib/std_conf": 0.23882599980282337,
"calib/step_conf_rate": 0.09375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 534.37890625,
"completions/mean_terminated_length": 577.2193603515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 0.1741025298833847,
"learning_rate": 1.5e-06,
"loss": -0.9792,
"mask/has_final_conf_rate": 0.0859375,
"mask/share_final_conf": 0.0018007527105510235,
"mask/share_reasoning": 0.9179291725158691,
"mask/share_step_conf": 0.006051314529031515,
"num_tokens": 1618601.0,
"reward": 0.06880377233028412,
"reward_std": 0.1430698037147522,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.021675746887922287,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.03692592680454254,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.10781514644622803,
"adv/mean_abs_reasoning": 0.1530671864748001,
"adv/mean_abs_step_conf": 0.11152852326631546,
"adv/ratio_final_to_reasoning": 0.7043648539524043,
"adv/ratio_step_to_reasoning": 0.7286246375520642,
"adv/std_final_conf": 0.36762505769729614,
"adv/std_reasoning": 0.43820399045944214,
"adv/std_step_conf": 0.37017136812210083,
"calib/answer_extract_rate": 0.1015625,
"calib/auroc": 0.52,
"calib/avg_num_step_conf": 0.28125,
"calib/ece": 0.6213333333333335,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.8666666666666667,
"calib/gap": 0.008000000000000007,
"calib/mean_conf": 0.9346666666666666,
"calib/mu_c": 0.9400000000000001,
"calib/mu_w": 0.932,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.12109375,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.6113333333333335,
"calib/std_conf": 0.07499925925560123,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2604.0,
"completions/max_terminated_length": 2604.0,
"completions/mean_length": 626.88671875,
"completions/mean_terminated_length": 697.7521362304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.0667913407087326,
"learning_rate": 1.75e-06,
"loss": -0.5539,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.001980168977752328,
"mask/share_reasoning": 0.893886387348175,
"mask/share_step_conf": 0.0025709576439112425,
"num_tokens": 1886508.0,
"reward": 0.05245732516050339,
"reward_std": 0.10757142305374146,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.013073046691715717,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.02025311440229416,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.20839053392410278,
"adv/mean_abs_reasoning": 0.21437883377075195,
"adv/mean_abs_step_conf": 0.21140068769454956,
"adv/ratio_final_to_reasoning": 0.9720667393262675,
"adv/ratio_step_to_reasoning": 0.9861080218423658,
"adv/std_final_conf": 0.521725058555603,
"adv/std_reasoning": 0.5237656831741333,
"adv/std_step_conf": 0.5234709978103638,
"calib/answer_extract_rate": 0.09375,
"calib/auroc": 0.45714285714285713,
"calib/avg_num_step_conf": 0.43359375,
"calib/ece": 0.36470588235294127,
"calib/final_conf_rate": 0.06640625,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.8235294117647058,
"calib/gap": 0.0345714285714287,
"calib/mean_conf": 0.9317647058823529,
"calib/mu_c": 0.9460000000000001,
"calib/mu_w": 0.9114285714285714,
"calib/nonempty_final_conf_rate": 0.06640625,
"calib/nonempty_reasoning_rate": 0.12890625,
"calib/nonempty_step_conf_rate": 0.1015625,
"calib/pce": 0.35411764705882365,
"calib/std_conf": 0.08806650522508859,
"calib/step_conf_rate": 0.1015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 2939.0,
"completions/max_terminated_length": 2939.0,
"completions/mean_length": 635.58984375,
"completions/mean_terminated_length": 716.7885131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.1552148461341858,
"learning_rate": 2.0000000000000003e-06,
"loss": -1.2229,
"mask/has_final_conf_rate": 0.06640625,
"mask/share_final_conf": 0.0010213814675807953,
"mask/share_reasoning": 0.8800764083862305,
"mask/share_step_conf": 0.00562096806243062,
"num_tokens": 2155731.0,
"reward": 0.10557056963443756,
"reward_std": 0.2017257809638977,
"rewards/accuracy_reward_step": 0.04296875,
"rewards/final_brier_reward_step": 0.04262109100818634,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.04529935121536255,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.1774379312992096,
"adv/mean_abs_reasoning": 0.20591090619564056,
"adv/mean_abs_step_conf": 0.18646956980228424,
"adv/ratio_final_to_reasoning": 0.86172187077173,
"adv/ratio_step_to_reasoning": 0.9055837461329772,
"adv/std_final_conf": 0.4876402020454407,
"adv/std_reasoning": 0.5237042903900146,
"adv/std_step_conf": 0.49650344252586365,
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.24999999999999997,
"calib/avg_num_step_conf": 0.26953125,
"calib/ece": 0.7626666666666667,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": -0.1742307692307693,
"calib/mean_conf": 0.896,
"calib/mu_c": 0.745,
"calib/mu_w": 0.9192307692307693,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.06640625,
"calib/pce": 0.7626666666666667,
"calib/std_conf": 0.14627827362029308,
"calib/step_conf_rate": 0.06640625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 669.16796875,
"completions/mean_terminated_length": 732.0812377929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0096,
"grad_norm": 0.16828308999538422,
"learning_rate": 2.25e-06,
"loss": -1.0588,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.0011211318196728826,
"mask/share_reasoning": 0.909247875213623,
"mask/share_step_conf": 0.0036935298703610897,
"num_tokens": 2434574.0,
"reward": 0.0528482049703598,
"reward_std": 0.1220763698220253,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.010931640863418579,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.026011832058429718,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.13913732767105103,
"adv/mean_abs_reasoning": 0.16954657435417175,
"adv/mean_abs_step_conf": 0.14402230083942413,
"adv/ratio_final_to_reasoning": 0.820643697468061,
"adv/ratio_step_to_reasoning": 0.8494556813549705,
"adv/std_final_conf": 0.43238168954849243,
"adv/std_reasoning": 0.4683932960033417,
"adv/std_step_conf": 0.4379076659679413,
"calib/answer_extract_rate": 0.08984375,
"calib/auroc": 0.1428571428571429,
"calib/avg_num_step_conf": 0.3984375,
"calib/ece": 0.8740000000000001,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": -0.04785714285714271,
"calib/mean_conf": 0.9246666666666667,
"calib/mu_c": 0.88,
"calib/mu_w": 0.9278571428571427,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.11328125,
"calib/nonempty_step_conf_rate": 0.07421875,
"calib/pce": 0.8660000000000001,
"calib/std_conf": 0.09258269576738169,
"calib/step_conf_rate": 0.07421875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2864.0,
"completions/max_terminated_length": 2864.0,
"completions/mean_length": 595.4765625,
"completions/mean_terminated_length": 645.940673828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.1307852417230606,
"learning_rate": 2.5e-06,
"loss": -0.9333,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.001026952755637467,
"mask/share_reasoning": 0.9161863923072815,
"mask/share_step_conf": 0.0046616545878350735,
"num_tokens": 2693816.0,
"reward": 0.034988999366760254,
"reward_std": 0.08137714862823486,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.004281249828636646,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.020962368696928024,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.23236548900604248,
"adv/mean_abs_reasoning": 0.24977372586727142,
"adv/mean_abs_step_conf": 0.2524455487728119,
"adv/ratio_final_to_reasoning": 0.9303039709209463,
"adv/ratio_step_to_reasoning": 1.0106969734156916,
"adv/std_final_conf": 0.5680802464485168,
"adv/std_reasoning": 0.5737043023109436,
"adv/std_step_conf": 0.5733542442321777,
"calib/answer_extract_rate": 0.1171875,
"calib/auroc": 0.48214285714285715,
"calib/avg_num_step_conf": 0.48828125,
"calib/ece": 0.7459999999999999,
"calib/final_conf_rate": 0.09765625,
"calib/format_rate": 0.0703125,
"calib/frac_conf_gt_0.9": 0.72,
"calib/gap": -0.0635714285714285,
"calib/mean_conf": 0.8483999999999999,
"calib/mu_c": 0.7949999999999999,
"calib/mu_w": 0.8585714285714284,
"calib/nonempty_final_conf_rate": 0.09765625,
"calib/nonempty_reasoning_rate": 0.140625,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.7172,
"calib/std_conf": 0.2857646584166768,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 635.75390625,
"completions/mean_terminated_length": 698.5107421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.22519375383853912,
"learning_rate": 2.7500000000000004e-06,
"loss": -1.1239,
"mask/has_final_conf_rate": 0.09765625,
"mask/share_final_conf": 0.0028018150478601456,
"mask/share_reasoning": 0.9016100168228149,
"mask/share_step_conf": 0.005744474474340677,
"num_tokens": 2961049.0,
"reward": 0.07517063617706299,
"reward_std": 0.16879817843437195,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.019722266122698784,
"rewards/format_reward_step": 0.0703125,
"rewards/stepwise_brier_reward": 0.03891763836145401,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.20684903860092163,
"adv/mean_abs_reasoning": 0.22811613976955414,
"adv/mean_abs_step_conf": 0.23255550861358643,
"adv/ratio_final_to_reasoning": 0.906770730075843,
"adv/ratio_step_to_reasoning": 1.0194610028405575,
"adv/std_final_conf": 0.517840564250946,
"adv/std_reasoning": 0.549264132976532,
"adv/std_step_conf": 0.5490169525146484,
"calib/answer_extract_rate": 0.1171875,
"calib/auroc": 0.5750000000000001,
"calib/avg_num_step_conf": 0.359375,
"calib/ece": 0.6822222222222221,
"calib/final_conf_rate": 0.09375,
"calib/format_rate": 0.0625,
"calib/frac_conf_gt_0.9": 0.7083333333333334,
"calib/gap": 0.07033333333333336,
"calib/mean_conf": 0.8163888888888889,
"calib/mu_c": 0.875,
"calib/mu_w": 0.8046666666666666,
"calib/nonempty_final_conf_rate": 0.09375,
"calib/nonempty_reasoning_rate": 0.12890625,
"calib/nonempty_step_conf_rate": 0.0859375,
"calib/pce": 0.6659722222222221,
"calib/std_conf": 0.2886413202027649,
"calib/step_conf_rate": 0.0859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3065.0,
"completions/max_terminated_length": 3065.0,
"completions/mean_length": 692.40234375,
"completions/mean_terminated_length": 751.0805053710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0128,
"grad_norm": 0.1815670132637024,
"learning_rate": 3e-06,
"loss": -1.1994,
"mask/has_final_conf_rate": 0.09375,
"mask/share_final_conf": 0.0017455201596021652,
"mask/share_reasoning": 0.9151567220687866,
"mask/share_step_conf": 0.00497277919203043,
"num_tokens": 3242480.0,
"reward": 0.07485318183898926,
"reward_std": 0.16080008447170258,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.024570703506469727,
"rewards/format_reward_step": 0.0625,
"rewards/stepwise_brier_reward": 0.043671008199453354,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.1520506739616394,
"adv/mean_abs_reasoning": 0.1872754991054535,
"adv/mean_abs_step_conf": 0.1856890469789505,
"adv/ratio_final_to_reasoning": 0.8119090574470756,
"adv/ratio_step_to_reasoning": 0.9915287790763827,
"adv/std_final_conf": 0.46135035157203674,
"adv/std_reasoning": 0.4968181252479553,
"adv/std_step_conf": 0.49658989906311035,
"calib/answer_extract_rate": 0.1015625,
"calib/auroc": 0.5769230769230769,
"calib/avg_num_step_conf": 0.38671875,
"calib/ece": 0.7679999999999998,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.0546875,
"calib/frac_conf_gt_0.9": 0.9333333333333333,
"calib/gap": 0.07923076923076933,
"calib/mean_conf": 0.9013333333333333,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8907692307692306,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.12890625,
"calib/nonempty_step_conf_rate": 0.0859375,
"calib/pce": 0.7679999999999998,
"calib/std_conf": 0.23391926432472854,
"calib/step_conf_rate": 0.0859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3057.0,
"completions/max_terminated_length": 3057.0,
"completions/mean_length": 635.34765625,
"completions/mean_terminated_length": 701.0733032226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.15629175305366516,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.8651,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.0008117250981740654,
"mask/share_reasoning": 0.9016801714897156,
"mask/share_step_conf": 0.003758120583370328,
"num_tokens": 3509721.0,
"reward": 0.055212050676345825,
"reward_std": 0.11932926625013351,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.014823437668383121,
"rewards/format_reward_step": 0.0546875,
"rewards/stepwise_brier_reward": 0.03269988298416138,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.2434786558151245,
"adv/mean_abs_reasoning": 0.2808641195297241,
"adv/mean_abs_step_conf": 0.2866297662258148,
"adv/ratio_final_to_reasoning": 0.8668912790384281,
"adv/ratio_step_to_reasoning": 1.0205282422893485,
"adv/std_final_conf": 0.567785382270813,
"adv/std_reasoning": 0.5971327424049377,
"adv/std_step_conf": 0.5968067646026611,
"calib/answer_extract_rate": 0.11328125,
"calib/auroc": 0.27976190476190477,
"calib/avg_num_step_conf": 0.453125,
"calib/ece": 0.7948,
"calib/final_conf_rate": 0.09765625,
"calib/format_rate": 0.08203125,
"calib/frac_conf_gt_0.9": 0.88,
"calib/gap": -0.04738095238095241,
"calib/mean_conf": 0.9548000000000001,
"calib/mu_c": 0.915,
"calib/mu_w": 0.9623809523809524,
"calib/nonempty_final_conf_rate": 0.09765625,
"calib/nonempty_reasoning_rate": 0.140625,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.7948,
"calib/std_conf": 0.04622726468221972,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 2942.0,
"completions/max_terminated_length": 2942.0,
"completions/mean_length": 567.47265625,
"completions/mean_terminated_length": 639.9691162109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.13358238339424133,
"learning_rate": 3.5e-06,
"loss": -1.1973,
"mask/has_final_conf_rate": 0.09765625,
"mask/share_final_conf": 0.0016311781946569681,
"mask/share_reasoning": 0.8782390356063843,
"mask/share_step_conf": 0.0068485308438539505,
"num_tokens": 3760394.0,
"reward": 0.08635027706623077,
"reward_std": 0.18285219371318817,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0193667970597744,
"rewards/format_reward_step": 0.08203125,
"rewards/stepwise_brier_reward": 0.0497359000146389,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.1690009981393814,
"adv/mean_abs_reasoning": 0.21810534596443176,
"adv/mean_abs_step_conf": 0.17306694388389587,
"adv/ratio_final_to_reasoning": 0.7748594945808518,
"adv/ratio_step_to_reasoning": 0.7935016132622413,
"adv/std_final_conf": 0.4642001986503601,
"adv/std_reasoning": 0.5237811803817749,
"adv/std_step_conf": 0.4681417942047119,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.46875,
"calib/avg_num_step_conf": 0.33984375,
"calib/ece": 0.39374999999999993,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.5625,
"calib/gap": -0.0050000000000000044,
"calib/mean_conf": 0.8724999999999999,
"calib/mu_c": 0.87,
"calib/mu_w": 0.875,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.38312499999999994,
"calib/std_conf": 0.12695963925594622,
"calib/step_conf_rate": 0.0703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 645.37109375,
"completions/mean_terminated_length": 718.3260498046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.016,
"grad_norm": 0.21051011979579926,
"learning_rate": 3.7500000000000005e-06,
"loss": -1.0117,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.002414593007415533,
"mask/share_reasoning": 0.8917868137359619,
"mask/share_step_conf": 0.004236122127622366,
"num_tokens": 4033489.0,
"reward": 0.07319016009569168,
"reward_std": 0.1653515249490738,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.02371484413743019,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.025147901847958565,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.140500009059906,
"adv/mean_abs_reasoning": 0.15322402119636536,
"adv/mean_abs_step_conf": 0.15020135045051575,
"adv/ratio_final_to_reasoning": 0.9169581111557384,
"adv/ratio_step_to_reasoning": 0.9802728663413951,
"adv/std_final_conf": 0.4350675642490387,
"adv/std_reasoning": 0.43818145990371704,
"adv/std_step_conf": 0.4379708468914032,
"calib/answer_extract_rate": 0.0546875,
"calib/auroc": 0.888888888888889,
"calib/avg_num_step_conf": 0.2109375,
"calib/ece": 0.7154545454545455,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8181818181818182,
"calib/gap": 0.10111111111111104,
"calib/mean_conf": 0.897272727272727,
"calib/mu_c": 0.98,
"calib/mu_w": 0.8788888888888889,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.7154545454545455,
"calib/std_conf": 0.1878147495946185,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12109375,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 704.78515625,
"completions/mean_terminated_length": 801.888916015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.16761568188667297,
"learning_rate": 4.000000000000001e-06,
"loss": -0.9862,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.001200198195874691,
"mask/share_reasoning": 0.8753113150596619,
"mask/share_step_conf": 0.0023947488516569138,
"num_tokens": 4322762.0,
"reward": 0.04348303750157356,
"reward_std": 0.1033376008272171,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.013904296793043613,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.025326423346996307,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.16247119009494781,
"adv/mean_abs_reasoning": 0.204107403755188,
"adv/mean_abs_step_conf": 0.171944722533226,
"adv/ratio_final_to_reasoning": 0.7960083128087808,
"adv/ratio_step_to_reasoning": 0.8424227606141188,
"adv/std_final_conf": 0.4362327754497528,
"adv/std_reasoning": 0.46846750378608704,
"adv/std_step_conf": 0.43806710839271545,
"calib/answer_extract_rate": 0.1171875,
"calib/auroc": 0.5491071428571428,
"calib/avg_num_step_conf": 0.5625,
"calib/ece": 0.6430434782608694,
"calib/final_conf_rate": 0.08984375,
"calib/format_rate": 0.07421875,
"calib/frac_conf_gt_0.9": 0.8260869565217391,
"calib/gap": -0.002410714285714266,
"calib/mean_conf": 0.947391304347826,
"calib/mu_c": 0.9457142857142856,
"calib/mu_w": 0.9481249999999999,
"calib/nonempty_final_conf_rate": 0.08984375,
"calib/nonempty_reasoning_rate": 0.14453125,
"calib/nonempty_step_conf_rate": 0.10546875,
"calib/pce": 0.6430434782608694,
"calib/std_conf": 0.05317872991798486,
"calib/step_conf_rate": 0.10546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2968.0,
"completions/max_terminated_length": 2968.0,
"completions/mean_length": 631.60546875,
"completions/mean_terminated_length": 682.240478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.1345282644033432,
"learning_rate": 4.25e-06,
"loss": -1.0562,
"mask/has_final_conf_rate": 0.08984375,
"mask/share_final_conf": 0.0014626241754740477,
"mask/share_reasoning": 0.9170229434967041,
"mask/share_step_conf": 0.00729574216529727,
"num_tokens": 4587981.0,
"reward": 0.10108289122581482,
"reward_std": 0.13088199496269226,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.029129687696695328,
"rewards/format_reward_step": 0.07421875,
"rewards/stepwise_brier_reward": 0.05088218301534653,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.14412738382816315,
"adv/mean_abs_reasoning": 0.14765673875808716,
"adv/mean_abs_step_conf": 0.14979150891304016,
"adv/ratio_final_to_reasoning": 0.9760975695412973,
"adv/ratio_step_to_reasoning": 1.0144576547803246,
"adv/std_final_conf": 0.43377164006233215,
"adv/std_reasoning": 0.43817782402038574,
"adv/std_step_conf": 0.4377943277359009,
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.22916666666666666,
"calib/avg_num_step_conf": 0.2109375,
"calib/ece": 0.6718181818181819,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8181818181818182,
"calib/gap": 0.01000000000000012,
"calib/mean_conf": 0.922727272727273,
"calib/mu_c": 0.93,
"calib/mu_w": 0.9199999999999999,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.660909090909091,
"calib/std_conf": 0.10514845860547571,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 772.25390625,
"completions/mean_terminated_length": 837.6991577148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0192,
"grad_norm": 0.13379941880702972,
"learning_rate": 4.5e-06,
"loss": -0.9989,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0005710519617423415,
"mask/share_reasoning": 0.918830394744873,
"mask/share_step_conf": 0.002473499160259962,
"num_tokens": 4896398.0,
"reward": 0.049791865050792694,
"reward_std": 0.1002160832285881,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.015680858865380287,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.021430809050798416,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.17932841181755066,
"adv/mean_abs_reasoning": 0.19484557211399078,
"adv/mean_abs_step_conf": 0.19124746322631836,
"adv/ratio_final_to_reasoning": 0.9203617504463376,
"adv/ratio_step_to_reasoning": 0.9815335352575145,
"adv/std_final_conf": 0.4871034622192383,
"adv/std_reasoning": 0.4967937469482422,
"adv/std_step_conf": 0.4966728985309601,
"calib/answer_extract_rate": 0.0859375,
"calib/avg_num_step_conf": 0.28125,
"calib/ece": 0.8526666666666669,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.8,
"calib/mean_conf": 0.8526666666666667,
"calib/mu_c": NaN,
"calib/mu_w": 0.8526666666666667,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.8526666666666669,
"calib/std_conf": 0.26541707221319094,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2996.0,
"completions/max_terminated_length": 2996.0,
"completions/mean_length": 555.2578125,
"completions/mean_terminated_length": 646.1181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.18200555443763733,
"learning_rate": 4.75e-06,
"loss": -1.137,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.0014445590786635876,
"mask/share_reasoning": 0.8540744781494141,
"mask/share_step_conf": 0.0038559352979063988,
"num_tokens": 5143304.0,
"reward": 0.0447770394384861,
"reward_std": 0.09865927696228027,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.01148046925663948,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.03303259238600731,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.28337258100509644,
"adv/mean_abs_reasoning": 0.3037603497505188,
"adv/mean_abs_step_conf": 0.306016206741333,
"adv/ratio_final_to_reasoning": 0.932882060604134,
"adv/ratio_step_to_reasoning": 1.0074264366388404,
"adv/std_final_conf": 0.6145702004432678,
"adv/std_reasoning": 0.6196979284286499,
"adv/std_step_conf": 0.6194650530815125,
"calib/answer_extract_rate": 0.14453125,
"calib/auroc": 0.5138888888888888,
"calib/avg_num_step_conf": 0.59375,
"calib/ece": 0.6146153846153846,
"calib/final_conf_rate": 0.1015625,
"calib/format_rate": 0.0859375,
"calib/frac_conf_gt_0.9": 0.8076923076923077,
"calib/gap": 0.02736111111111106,
"calib/mean_conf": 0.9223076923076923,
"calib/mu_c": 0.9412499999999999,
"calib/mu_w": 0.9138888888888889,
"calib/nonempty_final_conf_rate": 0.1015625,
"calib/nonempty_reasoning_rate": 0.16796875,
"calib/nonempty_step_conf_rate": 0.1171875,
"calib/pce": 0.6146153846153846,
"calib/std_conf": 0.1132494758795035,
"calib/step_conf_rate": 0.1171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 666.19140625,
"completions/mean_terminated_length": 728.8248291015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.23702050745487213,
"learning_rate": 5e-06,
"loss": -1.8863,
"mask/has_final_conf_rate": 0.1015625,
"mask/share_final_conf": 0.0014831081498414278,
"mask/share_reasoning": 0.9051233530044556,
"mask/share_step_conf": 0.00745608052238822,
"num_tokens": 5418721.0,
"reward": 0.11435748636722565,
"reward_std": 0.2429792582988739,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.04015117138624191,
"rewards/format_reward_step": 0.0859375,
"rewards/stepwise_brier_reward": 0.060201872140169144,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.2655344605445862,
"adv/mean_abs_reasoning": 0.27671802043914795,
"adv/mean_abs_step_conf": 0.27363383769989014,
"adv/ratio_final_to_reasoning": 0.9595849960302058,
"adv/ratio_step_to_reasoning": 0.9888544203432676,
"adv/std_final_conf": 0.5929623246192932,
"adv/std_reasoning": 0.5971682071685791,
"adv/std_step_conf": 0.5968631505966187,
"calib/answer_extract_rate": 0.12109375,
"calib/auroc": 0.7232142857142857,
"calib/avg_num_step_conf": 0.4375,
"calib/ece": 0.6221739130434784,
"calib/final_conf_rate": 0.08984375,
"calib/format_rate": 0.078125,
"calib/frac_conf_gt_0.9": 0.6956521739130435,
"calib/gap": 0.031696428571428736,
"calib/mean_conf": 0.9265217391304347,
"calib/mu_c": 0.9485714285714286,
"calib/mu_w": 0.9168749999999999,
"calib/nonempty_final_conf_rate": 0.08984375,
"calib/nonempty_reasoning_rate": 0.14453125,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.6221739130434784,
"calib/std_conf": 0.06210850922952952,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 637.85546875,
"completions/mean_terminated_length": 686.0966796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0224,
"grad_norm": 0.16225571930408478,
"learning_rate": 4.9722222222222224e-06,
"loss": -1.432,
"mask/has_final_conf_rate": 0.08984375,
"mask/share_final_conf": 0.0013876496814191341,
"mask/share_reasoning": 0.9224339723587036,
"mask/share_step_conf": 0.005865876562893391,
"num_tokens": 5684972.0,
"reward": 0.10086153447628021,
"reward_std": 0.22441299259662628,
"rewards/accuracy_reward_step": 0.02734375,
"rewards/final_brier_reward_step": 0.03598632663488388,
"rewards/format_reward_step": 0.078125,
"rewards/stepwise_brier_reward": 0.05091741681098938,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.21784275770187378,
"adv/mean_abs_reasoning": 0.23352275788784027,
"adv/mean_abs_step_conf": 0.23176658153533936,
"adv/ratio_final_to_reasoning": 0.9328545092230475,
"adv/ratio_step_to_reasoning": 0.992479635096874,
"adv/std_final_conf": 0.5453119874000549,
"adv/std_reasoning": 0.549296498298645,
"adv/std_step_conf": 0.5490684509277344,
"calib/answer_extract_rate": 0.10546875,
"calib/auroc": 0.6166666666666667,
"calib/avg_num_step_conf": 0.57421875,
"calib/ece": 0.6299999999999999,
"calib/final_conf_rate": 0.07421875,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.631578947368421,
"calib/gap": 0.13549999999999995,
"calib/mean_conf": 0.8405263157894737,
"calib/mu_c": 0.9474999999999999,
"calib/mu_w": 0.8119999999999999,
"calib/nonempty_final_conf_rate": 0.07421875,
"calib/nonempty_reasoning_rate": 0.140625,
"calib/nonempty_step_conf_rate": 0.1015625,
"calib/pce": 0.6299999999999999,
"calib/std_conf": 0.2554144966460457,
"calib/step_conf_rate": 0.1015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 626.9453125,
"completions/mean_terminated_length": 682.97021484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.24337080121040344,
"learning_rate": 4.944444444444445e-06,
"loss": -1.2015,
"mask/has_final_conf_rate": 0.07421875,
"mask/share_final_conf": 0.0015047264751046896,
"mask/share_reasoning": 0.9101567268371582,
"mask/share_step_conf": 0.0063072992488741875,
"num_tokens": 5947286.0,
"reward": 0.07395316660404205,
"reward_std": 0.18006539344787598,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.029487499967217445,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.043318841606378555,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.17735733091831207,
"adv/mean_abs_reasoning": 0.18752095103263855,
"adv/mean_abs_step_conf": 0.1866583526134491,
"adv/ratio_final_to_reasoning": 0.9458000822928982,
"adv/ratio_step_to_reasoning": 0.9953999890975419,
"adv/std_final_conf": 0.4912124276161194,
"adv/std_reasoning": 0.4968550503253937,
"adv/std_step_conf": 0.4964953660964966,
"calib/answer_extract_rate": 0.09375,
"calib/auroc": 0.4375,
"calib/avg_num_step_conf": 0.3046875,
"calib/ece": 0.7299999999999999,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.8888888888888888,
"calib/gap": -0.0028571428571428914,
"calib/mean_conf": 0.9522222222222223,
"calib/mu_c": 0.95,
"calib/mu_w": 0.9528571428571428,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.10546875,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.7299999999999999,
"calib/std_conf": 0.04429140317332165,
"calib/step_conf_rate": 0.0703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 617.796875,
"completions/mean_terminated_length": 675.88037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.13091301918029785,
"learning_rate": 4.9166666666666665e-06,
"loss": -1.0606,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.0017373452428728342,
"mask/share_reasoning": 0.9078471660614014,
"mask/share_step_conf": 0.004478019662201405,
"num_tokens": 6209378.0,
"reward": 0.05653750151395798,
"reward_std": 0.12422418594360352,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.014636717736721039,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.023725392296910286,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.19051101803779602,
"adv/mean_abs_reasoning": 0.20617303252220154,
"adv/mean_abs_step_conf": 0.20002737641334534,
"adv/ratio_final_to_reasoning": 0.9240346116424369,
"adv/ratio_step_to_reasoning": 0.9701917557612952,
"adv/std_final_conf": 0.4938734769821167,
"adv/std_reasoning": 0.49688783288002014,
"adv/std_step_conf": 0.496569961309433,
"calib/answer_extract_rate": 0.12890625,
"calib/auroc": 0.5164835164835164,
"calib/avg_num_step_conf": 0.71484375,
"calib/ece": 0.5834999999999997,
"calib/final_conf_rate": 0.078125,
"calib/format_rate": 0.07421875,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.03417582417582432,
"calib/mean_conf": 0.9334999999999999,
"calib/mu_c": 0.9557142857142857,
"calib/mu_w": 0.9215384615384614,
"calib/nonempty_final_conf_rate": 0.078125,
"calib/nonempty_reasoning_rate": 0.1640625,
"calib/nonempty_step_conf_rate": 0.125,
"calib/pce": 0.5834999999999997,
"calib/std_conf": 0.08193137372215849,
"calib/step_conf_rate": 0.125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 726.30859375,
"completions/mean_terminated_length": 791.2127685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0256,
"grad_norm": 0.13573439419269562,
"learning_rate": 4.888888888888889e-06,
"loss": -0.9367,
"mask/has_final_conf_rate": 0.078125,
"mask/share_final_conf": 0.0010043885558843613,
"mask/share_reasoning": 0.9085052609443665,
"mask/share_step_conf": 0.008459066040813923,
"num_tokens": 6499825.0,
"reward": 0.09818961471319199,
"reward_std": 0.15720906853675842,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.034287892282009125,
"rewards/format_reward_step": 0.07421875,
"rewards/stepwise_brier_reward": 0.04251652956008911,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.2048564851284027,
"adv/mean_abs_reasoning": 0.2124483287334442,
"adv/mean_abs_step_conf": 0.21995320916175842,
"adv/ratio_final_to_reasoning": 0.9642649878664525,
"adv/ratio_step_to_reasoning": 1.0353256741206491,
"adv/std_final_conf": 0.5154340863227844,
"adv/std_reasoning": 0.5237152576446533,
"adv/std_step_conf": 0.523485541343689,
"calib/answer_extract_rate": 0.10546875,
"calib/auroc": 0.661764705882353,
"calib/avg_num_step_conf": 0.578125,
"calib/ece": 0.7214285714285712,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.0703125,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.06867647058823534,
"calib/mean_conf": 0.9119047619047619,
"calib/mu_c": 0.9675,
"calib/mu_w": 0.8988235294117647,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.14453125,
"calib/nonempty_step_conf_rate": 0.12109375,
"calib/pce": 0.7214285714285712,
"calib/std_conf": 0.1858985654705031,
"calib/step_conf_rate": 0.12109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 589.234375,
"completions/mean_terminated_length": 641.8893432617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.14066976308822632,
"learning_rate": 4.861111111111111e-06,
"loss": -1.2565,
"mask/has_final_conf_rate": 0.08203125,
"mask/share_final_conf": 0.001033964566886425,
"mask/share_reasoning": 0.9100552201271057,
"mask/share_step_conf": 0.0068795569241046906,
"num_tokens": 6753893.0,
"reward": 0.08146456629037857,
"reward_std": 0.14879143238067627,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.021036718040704727,
"rewards/format_reward_step": 0.0703125,
"rewards/stepwise_brier_reward": 0.05084826797246933,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.1946616768836975,
"adv/mean_abs_reasoning": 0.22415024042129517,
"adv/mean_abs_step_conf": 0.22112242877483368,
"adv/ratio_final_to_reasoning": 0.8684428645618525,
"adv/ratio_step_to_reasoning": 0.9864920437258035,
"adv/std_final_conf": 0.520853579044342,
"adv/std_reasoning": 0.5492832064628601,
"adv/std_step_conf": 0.5490449070930481,
"calib/answer_extract_rate": 0.11328125,
"calib/auroc": 0.6428571428571428,
"calib/avg_num_step_conf": 0.83203125,
"calib/ece": 0.6833333333333332,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.0546875,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.0635714285714285,
"calib/mean_conf": 0.9055555555555556,
"calib/mu_c": 0.955,
"calib/mu_w": 0.8914285714285715,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.1484375,
"calib/nonempty_step_conf_rate": 0.10546875,
"calib/pce": 0.6833333333333332,
"calib/std_conf": 0.13136980129146128,
"calib/step_conf_rate": 0.10546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2874.0,
"completions/max_terminated_length": 2874.0,
"completions/mean_length": 614.3125,
"completions/mean_terminated_length": 674.9527587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.2409486472606659,
"learning_rate": 4.833333333333333e-06,
"loss": -1.2538,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.00079687888501212,
"mask/share_reasoning": 0.903478741645813,
"mask/share_step_conf": 0.005880659446120262,
"num_tokens": 7016397.0,
"reward": 0.06622931361198425,
"reward_std": 0.1676761656999588,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.023710157722234726,
"rewards/format_reward_step": 0.0546875,
"rewards/stepwise_brier_reward": 0.034666046500205994,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.17820599675178528,
"adv/mean_abs_reasoning": 0.2028244286775589,
"adv/mean_abs_step_conf": 0.1893235743045807,
"adv/ratio_final_to_reasoning": 0.8786219584776404,
"adv/ratio_step_to_reasoning": 0.9334357579064538,
"adv/std_final_conf": 0.4923804700374603,
"adv/std_reasoning": 0.5237365365028381,
"adv/std_step_conf": 0.4966087341308594,
"calib/answer_extract_rate": 0.10546875,
"calib/auroc": 0.6,
"calib/avg_num_step_conf": 0.3125,
"calib/ece": 0.6275000000000001,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.02036363636363636,
"calib/mean_conf": 0.94,
"calib/mu_c": 0.954,
"calib/mu_w": 0.9336363636363636,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.11328125,
"calib/nonempty_step_conf_rate": 0.078125,
"calib/pce": 0.6275000000000001,
"calib/std_conf": 0.04703721930556695,
"calib/step_conf_rate": 0.078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2857.0,
"completions/max_terminated_length": 2857.0,
"completions/mean_length": 603.59765625,
"completions/mean_terminated_length": 654.75,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0288,
"grad_norm": 0.1442146897315979,
"learning_rate": 4.805555555555556e-06,
"loss": -1.0975,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.0009431581711396575,
"mask/share_reasoning": 0.9170562028884888,
"mask/share_step_conf": 0.003875626251101494,
"num_tokens": 7276134.0,
"reward": 0.0711732804775238,
"reward_std": 0.15970629453659058,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.02380312606692314,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.03278874605894089,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.11494233459234238,
"adv/mean_abs_reasoning": 0.11595869064331055,
"adv/mean_abs_step_conf": 0.11588902771472931,
"adv/ratio_final_to_reasoning": 0.9912351886233824,
"adv/ratio_step_to_reasoning": 0.9993992435737695,
"adv/std_final_conf": 0.40212392807006836,
"adv/std_reasoning": 0.4056612551212311,
"adv/std_step_conf": 0.4054175913333893,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.1875,
"calib/ece": 0.5733333333333334,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.03499999999999992,
"calib/mean_conf": 0.9066666666666667,
"calib/mu_c": 0.9299999999999999,
"calib/mu_w": 0.895,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.5733333333333334,
"calib/std_conf": 0.09551032521262937,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 747.94921875,
"completions/mean_terminated_length": 843.502197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.13489677011966705,
"learning_rate": 4.777777777777778e-06,
"loss": -0.8107,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.00030747134587727487,
"mask/share_reasoning": 0.8835909962654114,
"mask/share_step_conf": 0.00282029015943408,
"num_tokens": 7574553.0,
"reward": 0.02977309562265873,
"reward_std": 0.08421102911233902,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.010675780475139618,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.015145798213779926,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.13464173674583435,
"adv/mean_abs_reasoning": 0.1608918309211731,
"adv/mean_abs_step_conf": 0.16683977842330933,
"adv/ratio_final_to_reasoning": 0.8368463207544723,
"adv/ratio_step_to_reasoning": 1.0369686109486214,
"adv/std_final_conf": 0.4364698529243469,
"adv/std_reasoning": 0.46844682097435,
"adv/std_step_conf": 0.46820372343063354,
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.513888888888889,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.6437692307692308,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.9230769230769231,
"calib/gap": 0.012333333333333307,
"calib/mean_conf": 0.9514615384615385,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9476666666666667,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.06640625,
"calib/pce": 0.6437692307692308,
"calib/std_conf": 0.04967063113785786,
"calib/step_conf_rate": 0.06640625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 692.18359375,
"completions/mean_terminated_length": 767.09521484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.21704789996147156,
"learning_rate": 4.75e-06,
"loss": -0.8478,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0006637288024649024,
"mask/share_reasoning": 0.8980405926704407,
"mask/share_step_conf": 0.0036394214257597923,
"num_tokens": 7858880.0,
"reward": 0.05586779862642288,
"reward_std": 0.1327630877494812,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.018447261303663254,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.028293216601014137,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.0374080091714859,
"adv/mean_abs_reasoning": 0.07730154693126678,
"adv/mean_abs_step_conf": 0.05792953446507454,
"adv/ratio_final_to_reasoning": 0.4839231639794673,
"adv/ratio_step_to_reasoning": 0.7493968331136115,
"adv/std_final_conf": 0.22671973705291748,
"adv/std_reasoning": 0.33120280504226685,
"adv/std_step_conf": 0.28659942746162415,
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.3888888888888889,
"calib/avg_num_step_conf": 0.29296875,
"calib/ece": 0.823,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.03000000000000025,
"calib/mean_conf": 0.9229999999999998,
"calib/mu_c": 0.95,
"calib/mu_w": 0.9199999999999997,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.823,
"calib/std_conf": 0.11550324670761418,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 707.390625,
"completions/mean_terminated_length": 780.5689697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.032,
"grad_norm": 0.09119506180286407,
"learning_rate": 4.722222222222222e-06,
"loss": -0.263,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.001026460900902748,
"mask/share_reasoning": 0.9018102884292603,
"mask/share_step_conf": 0.0034132516011595726,
"num_tokens": 8146956.0,
"reward": 0.012917187064886093,
"reward_std": 0.03653532266616821,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0004585937422234565,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.006073827389627695,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.10018590092658997,
"adv/mean_abs_reasoning": 0.10260053724050522,
"adv/mean_abs_step_conf": 0.10107364505529404,
"adv/ratio_final_to_reasoning": 0.9764656562347708,
"adv/ratio_step_to_reasoning": 0.9851180878163239,
"adv/std_final_conf": 0.3672908544540405,
"adv/std_reasoning": 0.3702797293663025,
"adv/std_step_conf": 0.3699702024459839,
"calib/answer_extract_rate": 0.03515625,
"calib/avg_num_step_conf": 0.10546875,
"calib/ece": 0.9275,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.875,
"calib/mean_conf": 0.9275,
"calib/mu_c": NaN,
"calib/mu_w": 0.9275,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.9275,
"calib/std_conf": 0.050682837331783206,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 728.1015625,
"completions/mean_terminated_length": 813.9476318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.10365401953458786,
"learning_rate": 4.694444444444445e-06,
"loss": -0.7181,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.001135041005909443,
"mask/share_reasoning": 0.8914518356323242,
"mask/share_step_conf": 0.0019443880300968885,
"num_tokens": 8439262.0,
"reward": 0.0171560849994421,
"reward_std": 0.0432112030684948,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0035996094811707735,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.009074865840375423,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.1154562383890152,
"adv/mean_abs_reasoning": 0.11964812874794006,
"adv/mean_abs_step_conf": 0.11882726103067398,
"adv/ratio_final_to_reasoning": 0.9649648481527378,
"adv/ratio_step_to_reasoning": 0.9931393183841982,
"adv/std_final_conf": 0.4041202664375305,
"adv/std_reasoning": 0.40568676590919495,
"adv/std_step_conf": 0.40552806854248047,
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.25925925925925924,
"calib/avg_num_step_conf": 0.14453125,
"calib/ece": 0.6475000000000001,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": 0.07444444444444431,
"calib/mean_conf": 0.8975,
"calib/mu_c": 0.9533333333333333,
"calib/mu_w": 0.8788888888888889,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.6475000000000001,
"calib/std_conf": 0.18046814123273947,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 669.89453125,
"completions/mean_terminated_length": 742.3939208984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.20890480279922485,
"learning_rate": 4.666666666666667e-06,
"loss": -0.7214,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0010919722262769938,
"mask/share_reasoning": 0.8988701701164246,
"mask/share_step_conf": 0.0023815971799194813,
"num_tokens": 8717459.0,
"reward": 0.041473038494586945,
"reward_std": 0.10357101261615753,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.017754295840859413,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.01938142627477646,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.14984360337257385,
"adv/mean_abs_reasoning": 0.19187571108341217,
"adv/mean_abs_step_conf": 0.1522672474384308,
"adv/ratio_final_to_reasoning": 0.7809409670796419,
"adv/ratio_step_to_reasoning": 0.7935722899926463,
"adv/std_final_conf": 0.4321545660495758,
"adv/std_reasoning": 0.49684324860572815,
"adv/std_step_conf": 0.4377925992012024,
"calib/answer_extract_rate": 0.0859375,
"calib/auroc": 0.6153846153846154,
"calib/avg_num_step_conf": 0.2109375,
"calib/ece": 0.8000000000000002,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.03076923076923055,
"calib/mean_conf": 0.9333333333333333,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9292307692307694,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.8000000000000002,
"calib/std_conf": 0.057696524062450134,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 724.015625,
"completions/mean_terminated_length": 782.0590209960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0352,
"grad_norm": 0.13100853562355042,
"learning_rate": 4.638888888888889e-06,
"loss": -1.0953,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.0013625889550894499,
"mask/share_reasoning": 0.9213559627532959,
"mask/share_step_conf": 0.0030626384541392326,
"num_tokens": 9009679.0,
"reward": 0.04265952855348587,
"reward_std": 0.10281442105770111,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.008678124286234379,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.01847999542951584,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.19780707359313965,
"adv/mean_abs_reasoning": 0.20779672265052795,
"adv/mean_abs_step_conf": 0.20860755443572998,
"adv/ratio_final_to_reasoning": 0.9519258584545202,
"adv/ratio_step_to_reasoning": 1.0039020431836438,
"adv/std_final_conf": 0.5196132063865662,
"adv/std_reasoning": 0.5236744284629822,
"adv/std_step_conf": 0.5234167575836182,
"calib/answer_extract_rate": 0.09375,
"calib/auroc": 0.5294117647058824,
"calib/avg_num_step_conf": 0.265625,
"calib/ece": 0.7855555555555556,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.07294117647058829,
"calib/mean_conf": 0.8411111111111111,
"calib/mu_c": 0.91,
"calib/mu_w": 0.8370588235294117,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.11328125,
"calib/nonempty_step_conf_rate": 0.078125,
"calib/pce": 0.7855555555555556,
"calib/std_conf": 0.21931430740400581,
"calib/step_conf_rate": 0.078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12890625,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 643.7578125,
"completions/mean_terminated_length": 739.0224609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.12947340309619904,
"learning_rate": 4.611111111111112e-06,
"loss": -0.9669,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.001347921323031187,
"mask/share_reasoning": 0.865627110004425,
"mask/share_step_conf": 0.004118745215237141,
"num_tokens": 9279593.0,
"reward": 0.048354651778936386,
"reward_std": 0.11566583812236786,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.015407422557473183,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.030411846935749054,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.10604438185691833,
"adv/mean_abs_reasoning": 0.14723995327949524,
"adv/mean_abs_step_conf": 0.10678407549858093,
"adv/ratio_final_to_reasoning": 0.7202147209027006,
"adv/ratio_step_to_reasoning": 0.72523845002776,
"adv/std_final_conf": 0.36649808287620544,
"adv/std_reasoning": 0.43816131353378296,
"adv/std_step_conf": 0.3700529932975769,
"calib/answer_extract_rate": 0.06640625,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.22265625,
"calib/ece": 0.8400000000000001,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.05555555555555558,
"calib/mean_conf": 0.9400000000000002,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9344444444444444,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.08203125,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.8400000000000001,
"calib/std_conf": 0.051185935568278884,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 760.9296875,
"completions/mean_terminated_length": 832.4701538085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.12047702819108963,
"learning_rate": 4.583333333333333e-06,
"loss": -0.4872,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.000588306924328208,
"mask/share_reasoning": 0.9106691479682922,
"mask/share_step_conf": 0.002805058378726244,
"num_tokens": 9583647.0,
"reward": 0.028987498953938484,
"reward_std": 0.07016594707965851,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0037542972713708878,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.01312909834086895,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.15778377652168274,
"adv/mean_abs_reasoning": 0.1813150942325592,
"adv/mean_abs_step_conf": 0.1688225269317627,
"adv/ratio_final_to_reasoning": 0.870218649966921,
"adv/ratio_step_to_reasoning": 0.9311002354565515,
"adv/std_final_conf": 0.4654620587825775,
"adv/std_reasoning": 0.49686798453330994,
"adv/std_step_conf": 0.4681243300437927,
"calib/answer_extract_rate": 0.0859375,
"calib/auroc": 0.4363636363636364,
"calib/avg_num_step_conf": 0.1875,
"calib/ece": 0.5506250000000001,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.5625,
"calib/gap": 0.04127272727272713,
"calib/mean_conf": 0.775625,
"calib/mu_c": 0.8039999999999999,
"calib/mu_w": 0.7627272727272728,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.5068750000000002,
"calib/std_conf": 0.32256721683239914,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2910.0,
"completions/max_terminated_length": 2910.0,
"completions/mean_length": 625.63671875,
"completions/mean_terminated_length": 672.9537963867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.0384,
"grad_norm": 0.12413109093904495,
"learning_rate": 4.555555555555556e-06,
"loss": -1.0095,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.0008500913972966373,
"mask/share_reasoning": 0.9263497591018677,
"mask/share_step_conf": 0.0024876415263861418,
"num_tokens": 9846522.0,
"reward": 0.06386526674032211,
"reward_std": 0.1407494843006134,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.02153554931282997,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.031025253236293793,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.07668406516313553,
"adv/mean_abs_reasoning": 0.10092823207378387,
"adv/mean_abs_step_conf": 0.07726116478443146,
"adv/ratio_final_to_reasoning": 0.7597880552101163,
"adv/ratio_step_to_reasoning": 0.765505975849745,
"adv/std_final_conf": 0.3285629451274872,
"adv/std_reasoning": 0.3703406751155853,
"adv/std_step_conf": 0.33102983236312866,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.16666666666666669,
"calib/avg_num_step_conf": 0.08203125,
"calib/ece": 0.7225,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": -0.35000000000000003,
"calib/mean_conf": 0.7225,
"calib/mu_c": 0.46,
"calib/mu_w": 0.81,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.5975,
"calib/std_conf": 0.3910482706776748,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 597.72265625,
"completions/mean_terminated_length": 695.5317993164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.09366196393966675,
"learning_rate": 4.527777777777778e-06,
"loss": -0.563,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.00032904170802794397,
"mask/share_reasoning": 0.8579970598220825,
"mask/share_step_conf": 0.00104893883690238,
"num_tokens": 10106635.0,
"reward": 0.025437016040086746,
"reward_std": 0.06532414257526398,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.005510937422513962,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.009056063368916512,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.0598284974694252,
"adv/mean_abs_reasoning": 0.10520166158676147,
"adv/mean_abs_step_conf": 0.08477747440338135,
"adv/ratio_final_to_reasoning": 0.5687029707233635,
"adv/ratio_step_to_reasoning": 0.8058568004029483,
"adv/std_final_conf": 0.28494229912757874,
"adv/std_reasoning": 0.3703286945819855,
"adv/std_step_conf": 0.3308514952659607,
"calib/answer_extract_rate": 0.046875,
"calib/auroc": 0.29166666666666663,
"calib/avg_num_step_conf": 0.17578125,
"calib/ece": 0.7075,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.875,
"calib/gap": -0.0033333333333334103,
"calib/mean_conf": 0.9575,
"calib/mu_c": 0.955,
"calib/mu_w": 0.9583333333333334,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.7075,
"calib/std_conf": 0.02586020108197149,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 744.63671875,
"completions/mean_terminated_length": 811.1787109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.0911625474691391,
"learning_rate": 4.5e-06,
"loss": -0.5374,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0005922014825046062,
"mask/share_reasoning": 0.9159488677978516,
"mask/share_step_conf": 0.0014276672154664993,
"num_tokens": 10404150.0,
"reward": 0.025644494220614433,
"reward_std": 0.05945397913455963,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.005485547240823507,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.00948371458798647,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.07660110294818878,
"adv/mean_abs_reasoning": 0.08327651768922806,
"adv/mean_abs_step_conf": 0.08296199142932892,
"adv/ratio_final_to_reasoning": 0.9198403712562689,
"adv/ratio_step_to_reasoning": 0.9962231098438471,
"adv/std_final_conf": 0.32866036891937256,
"adv/std_reasoning": 0.33119046688079834,
"adv/std_step_conf": 0.3309943675994873,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.1328125,
"calib/ece": 0.758,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.6,
"calib/mean_conf": 0.758,
"calib/mu_c": NaN,
"calib/mu_w": 0.758,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.03125,
"calib/pce": 0.758,
"calib/std_conf": 0.3121794355815258,
"calib/step_conf_rate": 0.03125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 656.76953125,
"completions/mean_terminated_length": 697.6473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0416,
"grad_norm": 0.0807473286986351,
"learning_rate": 4.472222222222223e-06,
"loss": -0.4533,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.00039599655428901315,
"mask/share_reasoning": 0.9395262002944946,
"mask/share_step_conf": 0.0014840353978797793,
"num_tokens": 10678371.0,
"reward": 0.016320213675498962,
"reward_std": 0.03991464152932167,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.005438671912997961,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.010389840230345726,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.02359767258167267,
"adv/mean_abs_reasoning": 0.025304459035396576,
"adv/mean_abs_step_conf": 0.0234784297645092,
"adv/ratio_final_to_reasoning": 0.932549972661482,
"adv/ratio_step_to_reasoning": 0.9278376483633547,
"adv/std_final_conf": 0.16475501656532288,
"adv/std_reasoning": 0.16561181843280792,
"adv/std_step_conf": 0.1655532270669937,
"calib/answer_extract_rate": 0.0390625,
"calib/avg_num_step_conf": 0.015625,
"calib/ece": 0.6166666666666667,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.6166666666666667,
"calib/mu_c": NaN,
"calib/mu_w": 0.6166666666666667,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.6166666666666667,
"calib/std_conf": 0.4365266951236265,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3042.0,
"completions/max_terminated_length": 3042.0,
"completions/mean_length": 686.71484375,
"completions/mean_terminated_length": 767.6812133789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.1018519401550293,
"learning_rate": 4.444444444444444e-06,
"loss": -0.1728,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0006191778229549527,
"mask/share_reasoning": 0.8933592438697815,
"mask/share_step_conf": 0.0005528143374249339,
"num_tokens": 10960930.0,
"reward": 0.0061796484515070915,
"reward_std": 0.011564082466065884,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0011230468517169356,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.003985273651778698,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.07689839601516724,
"adv/mean_abs_reasoning": 0.10181869566440582,
"adv/mean_abs_step_conf": 0.10532698780298233,
"adv/ratio_final_to_reasoning": 0.755248292205826,
"adv/ratio_step_to_reasoning": 1.0344562667561548,
"adv/std_final_conf": 0.3298797309398651,
"adv/std_reasoning": 0.3703538179397583,
"adv/std_step_conf": 0.370156466960907,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.19444444444444442,
"calib/avg_num_step_conf": 0.3515625,
"calib/ece": 0.6922222222222222,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.7777777777777778,
"calib/gap": -0.25,
"calib/mean_conf": 0.8033333333333332,
"calib/mu_c": 0.6366666666666666,
"calib/mu_w": 0.8866666666666666,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.5811111111111111,
"calib/std_conf": 0.31843366656181316,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 634.71484375,
"completions/mean_terminated_length": 725.388427734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.05635469779372215,
"learning_rate": 4.416666666666667e-06,
"loss": -0.4841,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0004555766936391592,
"mask/share_reasoning": 0.8720616102218628,
"mask/share_step_conf": 0.0024828272871673107,
"num_tokens": 11230665.0,
"reward": 0.03794757276773453,
"reward_std": 0.09024789929389954,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.011569530703127384,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.01932913064956665,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.10751166194677353,
"adv/mean_abs_reasoning": 0.10858826339244843,
"adv/mean_abs_step_conf": 0.10843968391418457,
"adv/ratio_final_to_reasoning": 0.9900854713756315,
"adv/ratio_step_to_reasoning": 0.9986317169680956,
"adv/std_final_conf": 0.3666759729385376,
"adv/std_reasoning": 0.3703175485134125,
"adv/std_step_conf": 0.3700626790523529,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.18359375,
"calib/ece": 0.8477777777777777,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.02124999999999999,
"calib/mean_conf": 0.9588888888888888,
"calib/mu_c": 0.94,
"calib/mu_w": 0.9612499999999999,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.8477777777777777,
"calib/std_conf": 0.007370277311900895,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 2989.0,
"completions/max_terminated_length": 2989.0,
"completions/mean_length": 557.27734375,
"completions/mean_terminated_length": 622.9825439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0448,
"grad_norm": 0.0798545628786087,
"learning_rate": 4.388888888888889e-06,
"loss": -0.6422,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0004774858825840056,
"mask/share_reasoning": 0.8922837972640991,
"mask/share_step_conf": 0.0017699991585686803,
"num_tokens": 11477696.0,
"reward": 0.025472892448306084,
"reward_std": 0.061406031250953674,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.005729687865823507,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.012924695387482643,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.10083452612161636,
"adv/mean_abs_reasoning": 0.10260053724050522,
"adv/mean_abs_step_conf": 0.10188859701156616,
"adv/ratio_final_to_reasoning": 0.9827875061243669,
"adv/ratio_step_to_reasoning": 0.993061047748023,
"adv/std_final_conf": 0.36501818895339966,
"adv/std_reasoning": 0.3702797293663025,
"adv/std_step_conf": 0.3699541687965393,
"calib/answer_extract_rate": 0.046875,
"calib/avg_num_step_conf": 0.10546875,
"calib/ece": 0.9424999999999999,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.875,
"calib/mean_conf": 0.9424999999999999,
"calib/mu_c": NaN,
"calib/mu_w": 0.9424999999999999,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.9424999999999999,
"calib/std_conf": 0.05402545696243574,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 674.1484375,
"completions/mean_terminated_length": 750.3565063476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.11436983942985535,
"learning_rate": 4.361111111111112e-06,
"loss": -0.8247,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0002926693414337933,
"mask/share_reasoning": 0.8972545862197876,
"mask/share_step_conf": 0.0008902625413611531,
"num_tokens": 11755502.0,
"reward": 0.016670772805809975,
"reward_std": 0.04176222160458565,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0016867187805473804,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.009060687385499477,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.08186276257038116,
"adv/mean_abs_reasoning": 0.08327651768922806,
"adv/mean_abs_step_conf": 0.08257116377353668,
"adv/ratio_final_to_reasoning": 0.9830233641118046,
"adv/ratio_step_to_reasoning": 0.991529978254811,
"adv/std_final_conf": 0.32681629061698914,
"adv/std_reasoning": 0.33119046688079834,
"adv/std_step_conf": 0.33087316155433655,
"calib/answer_extract_rate": 0.0546875,
"calib/avg_num_step_conf": 0.09765625,
"calib/ece": 0.96,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.96,
"calib/mu_c": NaN,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.96,
"calib/std_conf": 0.005773502691896263,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 726.59375,
"completions/mean_terminated_length": 812.2620239257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.1150890588760376,
"learning_rate": 4.333333333333334e-06,
"loss": -0.5969,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0002706256345845759,
"mask/share_reasoning": 0.8929933905601501,
"mask/share_step_conf": 0.0012672271113842726,
"num_tokens": 12047830.0,
"reward": 0.014166897162795067,
"reward_std": 0.034422408789396286,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0014558595139533281,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.008074614219367504,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.0662812888622284,
"adv/mean_abs_reasoning": 0.08627735078334808,
"adv/mean_abs_step_conf": 0.0632265955209732,
"adv/ratio_final_to_reasoning": 0.7682350960064595,
"adv/ratio_step_to_reasoning": 0.7328295890742188,
"adv/std_final_conf": 0.2857547700405121,
"adv/std_reasoning": 0.3312488794326782,
"adv/std_step_conf": 0.28668659925460815,
"calib/answer_extract_rate": 0.0390625,
"calib/auroc": 0.8333333333333334,
"calib/avg_num_step_conf": 0.08984375,
"calib/ece": 0.729,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.15666666666666662,
"calib/mean_conf": 0.8290000000000001,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8133333333333334,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.729,
"calib/std_conf": 0.28605768648998053,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 719.265625,
"completions/mean_terminated_length": 770.4267578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.048,
"grad_norm": 0.09593694657087326,
"learning_rate": 4.305555555555556e-06,
"loss": -0.5089,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0005101782153360546,
"mask/share_reasoning": 0.9324560761451721,
"mask/share_step_conf": 0.0006274882471188903,
"num_tokens": 12337010.0,
"reward": 0.02451431378722191,
"reward_std": 0.057681601494550705,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.008652344346046448,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.009546207264065742,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.058217599987983704,
"adv/mean_abs_reasoning": 0.06654815375804901,
"adv/mean_abs_step_conf": 0.06974774599075317,
"adv/ratio_final_to_reasoning": 0.8748191602677223,
"adv/ratio_step_to_reasoning": 1.0480793538516036,
"adv/std_final_conf": 0.28529322147369385,
"adv/std_reasoning": 0.28685876727104187,
"adv/std_step_conf": 0.2865889370441437,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.07421875,
"calib/ece": 0.6372222222222221,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.19933333333333336,
"calib/mean_conf": 0.8038888888888889,
"calib/mu_c": 0.97,
"calib/mu_w": 0.7706666666666666,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.6372222222222221,
"calib/std_conf": 0.23014018808726022,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 656.640625,
"completions/mean_terminated_length": 734.0611572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.08373303711414337,
"learning_rate": 4.277777777777778e-06,
"loss": -0.481,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0002997597330249846,
"mask/share_reasoning": 0.8933658599853516,
"mask/share_step_conf": 0.0008656186982989311,
"num_tokens": 12609878.0,
"reward": 0.019132791087031364,
"reward_std": 0.040449466556310654,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.007249609567224979,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.0072970278561115265,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.07674593478441238,
"adv/mean_abs_reasoning": 0.07731065154075623,
"adv/mean_abs_step_conf": 0.07728387415409088,
"adv/ratio_final_to_reasoning": 0.9926954857436929,
"adv/ratio_step_to_reasoning": 0.9996536391023528,
"adv/std_final_conf": 0.32882946729660034,
"adv/std_reasoning": 0.3312418460845947,
"adv/std_step_conf": 0.3311271071434021,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.078125,
"calib/ece": 0.6266666666666667,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.96,
"calib/mu_c": 0.96,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.6266666666666667,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 710.0,
"completions/mean_terminated_length": 793.7117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.09604458510875702,
"learning_rate": 4.25e-06,
"loss": -0.5308,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.00019567902199923992,
"mask/share_reasoning": 0.8936618566513062,
"mask/share_step_conf": 0.0006737294606864452,
"num_tokens": 12897614.0,
"reward": 0.023951200768351555,
"reward_std": 0.06774422526359558,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.008412499912083149,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.012446149252355099,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.08069179952144623,
"adv/mean_abs_reasoning": 0.10261328518390656,
"adv/mean_abs_step_conf": 0.08180000633001328,
"adv/ratio_final_to_reasoning": 0.7863679578801908,
"adv/ratio_step_to_reasoning": 0.7971677954117626,
"adv/std_final_conf": 0.32742586731910706,
"adv/std_reasoning": 0.3703286051750183,
"adv/std_step_conf": 0.33102738857269287,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.2265625,
"calib/ece": 0.7883333333333333,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": -0.06599999999999995,
"calib/mean_conf": 0.955,
"calib/mu_c": 0.9,
"calib/mu_w": 0.966,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.7883333333333333,
"calib/std_conf": 0.025658007197234402,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 689.80859375,
"completions/mean_terminated_length": 767.7869262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0512,
"grad_norm": 0.052539192140102386,
"learning_rate": 4.222222222222223e-06,
"loss": -0.5068,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0003127142845187336,
"mask/share_reasoning": 0.896579384803772,
"mask/share_step_conf": 0.001545402454212308,
"num_tokens": 13177893.0,
"reward": 0.0242399163544178,
"reward_std": 0.06236550211906433,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.00486523425206542,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.010890969075262547,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.07648647576570511,
"adv/mean_abs_reasoning": 0.07989173382520676,
"adv/mean_abs_step_conf": 0.08035692572593689,
"adv/ratio_final_to_reasoning": 0.9573765908378965,
"adv/ratio_step_to_reasoning": 1.0058227788840823,
"adv/std_final_conf": 0.32796990871429443,
"adv/std_reasoning": 0.33121076226234436,
"adv/std_step_conf": 0.3310745358467102,
"calib/answer_extract_rate": 0.03515625,
"calib/auroc": 0.6,
"calib/avg_num_step_conf": 0.1796875,
"calib/ece": 0.7916666666666667,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0020000000000000018,
"calib/mean_conf": 0.9583333333333334,
"calib/mu_c": 0.96,
"calib/mu_w": 0.958,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.7916666666666667,
"calib/std_conf": 0.003726779962499652,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 641.75390625,
"completions/mean_terminated_length": 687.4016723632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.09106607735157013,
"learning_rate": 4.194444444444445e-06,
"loss": -0.5967,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.00034151755971834064,
"mask/share_reasoning": 0.9314118027687073,
"mask/share_step_conf": 0.0018404647707939148,
"num_tokens": 13446718.0,
"reward": 0.021333744749426842,
"reward_std": 0.05266954004764557,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.005199609324336052,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.012723932042717934,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.13274917006492615,
"adv/mean_abs_reasoning": 0.1391177475452423,
"adv/mean_abs_step_conf": 0.13292866945266724,
"adv/ratio_final_to_reasoning": 0.9542216748567968,
"adv/ratio_step_to_reasoning": 0.9555119443652411,
"adv/std_final_conf": 0.40295490622520447,
"adv/std_reasoning": 0.4056834280490875,
"adv/std_step_conf": 0.40546315908432007,
"calib/answer_extract_rate": 0.0703125,
"calib/auroc": 0.5555555555555556,
"calib/avg_num_step_conf": 0.21484375,
"calib/ece": 0.6661538461538459,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8461538461538461,
"calib/gap": -0.029722222222222427,
"calib/mean_conf": 0.9430769230769229,
"calib/mu_c": 0.9224999999999999,
"calib/mu_w": 0.9522222222222223,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.0859375,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.6507692307692305,
"calib/std_conf": 0.044442636386628556,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2974.0,
"completions/max_terminated_length": 2974.0,
"completions/mean_length": 661.9453125,
"completions/mean_terminated_length": 730.4224243164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.15760673582553864,
"learning_rate": 4.166666666666667e-06,
"loss": -0.9254,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0007866570958867669,
"mask/share_reasoning": 0.9025189876556396,
"mask/share_step_conf": 0.002944336738437414,
"num_tokens": 13721536.0,
"reward": 0.049756817519664764,
"reward_std": 0.09445478022098541,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.014207031577825546,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.022097613662481308,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.05740310996770859,
"adv/mean_abs_reasoning": 0.07730336487293243,
"adv/mean_abs_step_conf": 0.07719512283802032,
"adv/ratio_final_to_reasoning": 0.7425693572597398,
"adv/ratio_step_to_reasoning": 0.9985997758947488,
"adv/std_final_conf": 0.2839992642402649,
"adv/std_reasoning": 0.3312106132507324,
"adv/std_step_conf": 0.3307470679283142,
"calib/answer_extract_rate": 0.01953125,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.9199999999999999,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": -0.8933333333333332,
"calib/mean_conf": 0.6699999999999999,
"calib/mu_c": 0.0,
"calib/mu_w": 0.8933333333333332,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.6699999999999999,
"calib/std_conf": 0.3953479480154159,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 640.1328125,
"completions/mean_terminated_length": 718.74560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0544,
"grad_norm": 0.1499822437763214,
"learning_rate": 4.138888888888889e-06,
"loss": -0.5292,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.00039907536120153964,
"mask/share_reasoning": 0.8892779350280762,
"mask/share_step_conf": 0.0009479941800236702,
"num_tokens": 13994706.0,
"reward": 0.015216129831969738,
"reward_std": 0.04303771257400513,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.002262500114738941,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.005863510072231293,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.09578309953212738,
"adv/mean_abs_reasoning": 0.0966200977563858,
"adv/mean_abs_step_conf": 0.09649413079023361,
"adv/ratio_final_to_reasoning": 0.9913372244109212,
"adv/ratio_step_to_reasoning": 0.9986962653829041,
"adv/std_final_conf": 0.36706623435020447,
"adv/std_reasoning": 0.3702698349952698,
"adv/std_step_conf": 0.36978742480278015,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.078125,
"calib/ece": 0.888,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.8,
"calib/mean_conf": 0.8880000000000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.8880000000000001,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.888,
"calib/std_conf": 0.11016351483136327,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 3016.0,
"completions/max_terminated_length": 3016.0,
"completions/mean_length": 734.859375,
"completions/mean_terminated_length": 825.1052856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.15719400346279144,
"learning_rate": 4.111111111111111e-06,
"loss": -0.5916,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0002719433978199959,
"mask/share_reasoning": 0.8888888359069824,
"mask/share_step_conf": 0.0014642456080764532,
"num_tokens": 14290782.0,
"reward": 0.01490075420588255,
"reward_std": 0.04214569926261902,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.003892968874424696,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.008323774673044682,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.0574328675866127,
"adv/mean_abs_reasoning": 0.05797934532165527,
"adv/mean_abs_step_conf": 0.05788220465183258,
"adv/ratio_final_to_reasoning": 0.9905746135626257,
"adv/ratio_step_to_reasoning": 0.998324564217071,
"adv/std_final_conf": 0.2841477394104004,
"adv/std_reasoning": 0.2868458330631256,
"adv/std_step_conf": 0.28636565804481506,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.6666666666666667,
"calib/avg_num_step_conf": 0.046875,
"calib/ece": 0.5525,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.20999999999999996,
"calib/mean_conf": 0.8025,
"calib/mu_c": 0.96,
"calib/mu_w": 0.75,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.5525,
"calib/std_conf": 0.27279800219209815,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3034.0,
"completions/max_terminated_length": 3034.0,
"completions/mean_length": 721.37890625,
"completions/mean_terminated_length": 789.2008666992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.08934785425662994,
"learning_rate": 4.083333333333334e-06,
"loss": -0.4098,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0001739041181281209,
"mask/share_reasoning": 0.913271427154541,
"mask/share_step_conf": 0.0006171360146254301,
"num_tokens": 14581279.0,
"reward": 0.0136606115847826,
"reward_std": 0.03863804414868355,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.004512500017881393,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.005533724091947079,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.12810653448104858,
"adv/mean_abs_reasoning": 0.15454918146133423,
"adv/mean_abs_step_conf": 0.13039252161979675,
"adv/ratio_final_to_reasoning": 0.8289046455616383,
"adv/ratio_step_to_reasoning": 0.8436959703498585,
"adv/std_final_conf": 0.4014511704444885,
"adv/std_reasoning": 0.43816977739334106,
"adv/std_step_conf": 0.4052298665046692,
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.6777777777777778,
"calib/avg_num_step_conf": 0.1875,
"calib/ece": 0.7527777777777777,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": 0.060666666666666424,
"calib/mean_conf": 0.9194444444444445,
"calib/mu_c": 0.9700000000000001,
"calib/mu_w": 0.9093333333333337,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.7527777777777777,
"calib/std_conf": 0.15668341124070756,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 619.06640625,
"completions/mean_terminated_length": 677.269287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0576,
"grad_norm": 0.0982208400964737,
"learning_rate": 4.055555555555556e-06,
"loss": -0.6749,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.0009278756915591657,
"mask/share_reasoning": 0.9105731248855591,
"mask/share_step_conf": 0.0025614770129323006,
"num_tokens": 14845992.0,
"reward": 0.04713316261768341,
"reward_std": 0.08477863669395447,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.010382031090557575,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.01876281015574932,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.08847086131572723,
"adv/mean_abs_reasoning": 0.09186718612909317,
"adv/mean_abs_step_conf": 0.09058874100446701,
"adv/ratio_final_to_reasoning": 0.9630300550558567,
"adv/ratio_step_to_reasoning": 0.9860837674637202,
"adv/std_final_conf": 0.3300953507423401,
"adv/std_reasoning": 0.3312879204750061,
"adv/std_step_conf": 0.3310956656932831,
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.5833333333333334,
"calib/avg_num_step_conf": 0.1484375,
"calib/ece": 0.483,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.1283333333333333,
"calib/mean_conf": 0.883,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8316666666666667,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.03125,
"calib/pce": 0.483,
"calib/std_conf": 0.22777401080895948,
"calib/step_conf_rate": 0.03125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2953.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 573.73046875,
"completions/mean_terminated_length": 655.6920166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.047732096165418625,
"learning_rate": 4.027777777777779e-06,
"loss": -0.5643,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0008438924560323358,
"mask/share_reasoning": 0.8722988367080688,
"mask/share_step_conf": 0.0018572770059108734,
"num_tokens": 15100691.0,
"reward": 0.04101718217134476,
"reward_std": 0.0868811085820198,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.016591796651482582,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.01514472346752882,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.11497887969017029,
"adv/mean_abs_reasoning": 0.13528089225292206,
"adv/mean_abs_step_conf": 0.11585846543312073,
"adv/ratio_final_to_reasoning": 0.8499269762000461,
"adv/ratio_step_to_reasoning": 0.8564288977079703,
"adv/std_final_conf": 0.4022437632083893,
"adv/std_reasoning": 0.43815046548843384,
"adv/std_step_conf": 0.40531063079833984,
"calib/answer_extract_rate": 0.0546875,
"calib/auroc": 0.7083333333333334,
"calib/avg_num_step_conf": 0.2109375,
"calib/ece": 0.64625,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.11166666666666669,
"calib/mean_conf": 0.89625,
"calib/mu_c": 0.98,
"calib/mu_w": 0.8683333333333333,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.64625,
"calib/std_conf": 0.15889756920733558,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 619.46875,
"completions/mean_terminated_length": 660.7667236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.1226649284362793,
"learning_rate": 4.000000000000001e-06,
"loss": -0.879,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.00035208818735554814,
"mask/share_reasoning": 0.934766411781311,
"mask/share_step_conf": 0.002381462138146162,
"num_tokens": 15366115.0,
"reward": 0.02765616402029991,
"reward_std": 0.07822343707084656,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.008952734060585499,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.011773454956710339,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.1396850049495697,
"adv/mean_abs_reasoning": 0.14115017652511597,
"adv/mean_abs_step_conf": 0.1439182162284851,
"adv/ratio_final_to_reasoning": 0.9896197680256846,
"adv/ratio_step_to_reasoning": 1.0196106003656085,
"adv/std_final_conf": 0.43321725726127625,
"adv/std_reasoning": 0.43815895915031433,
"adv/std_step_conf": 0.4378523528575897,
"calib/answer_extract_rate": 0.046875,
"calib/auroc": 0.5208333333333333,
"calib/avg_num_step_conf": 0.19140625,
"calib/ece": 0.6118181818181818,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.10833333333333339,
"calib/mean_conf": 0.8845454545454546,
"calib/mu_c": 0.9633333333333334,
"calib/mu_w": 0.855,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.6118181818181818,
"calib/std_conf": 0.2485727856223728,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2877.0,
"completions/max_terminated_length": 2877.0,
"completions/mean_length": 632.21484375,
"completions/mean_terminated_length": 674.362548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0608,
"grad_norm": 0.09539405256509781,
"learning_rate": 3.972222222222223e-06,
"loss": -1.0366,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0004407475353218615,
"mask/share_reasoning": 0.9352881908416748,
"mask/share_step_conf": 0.0017710895044729114,
"num_tokens": 15634754.0,
"reward": 0.04146331921219826,
"reward_std": 0.09492677450180054,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.013380076736211777,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.017642855644226074,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.056917693465948105,
"adv/mean_abs_reasoning": 0.05797205865383148,
"adv/mean_abs_step_conf": 0.05793406441807747,
"adv/ratio_final_to_reasoning": 0.9818125280977288,
"adv/ratio_step_to_reasoning": 0.9993446112379606,
"adv/std_final_conf": 0.28159940242767334,
"adv/std_reasoning": 0.2868097722530365,
"adv/std_step_conf": 0.2866218388080597,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.109375,
"calib/ece": 0.835,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.75,
"calib/mean_conf": 0.835,
"calib/mu_c": NaN,
"calib/mu_w": 0.835,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.835,
"calib/std_conf": 0.3163463292026636,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 2593.0,
"completions/max_terminated_length": 2593.0,
"completions/mean_length": 530.86328125,
"completions/mean_terminated_length": 593.4541625976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.036493152379989624,
"learning_rate": 3.944444444444445e-06,
"loss": -0.387,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.000371251895558089,
"mask/share_reasoning": 0.8929678797721863,
"mask/share_step_conf": 0.0011921343393623829,
"num_tokens": 15876975.0,
"reward": 0.00919120479375124,
"reward_std": 0.025996655225753784,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0007671875064261258,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.006280066911131144,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.09573530405759811,
"adv/mean_abs_reasoning": 0.09922303259372711,
"adv/mean_abs_step_conf": 0.10180923342704773,
"adv/ratio_final_to_reasoning": 0.9648496075461668,
"adv/ratio_step_to_reasoning": 1.026064521167277,
"adv/std_final_conf": 0.36712777614593506,
"adv/std_reasoning": 0.37032580375671387,
"adv/std_step_conf": 0.37004950642585754,
"calib/answer_extract_rate": 0.0546875,
"calib/auroc": 0.4375,
"calib/avg_num_step_conf": 0.1484375,
"calib/ece": 0.6970000000000001,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.07874999999999999,
"calib/mean_conf": 0.897,
"calib/mu_c": 0.96,
"calib/mu_w": 0.88125,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.6970000000000001,
"calib/std_conf": 0.19910047714659046,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 568.78125,
"completions/mean_terminated_length": 624.9270629882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.11433440446853638,
"learning_rate": 3.916666666666667e-06,
"loss": -0.618,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0011701658368110657,
"mask/share_reasoning": 0.9075009226799011,
"mask/share_step_conf": 0.0014852045569568872,
"num_tokens": 16128831.0,
"reward": 0.02821742370724678,
"reward_std": 0.07275127619504929,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.012121873907744884,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.01131141185760498,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.05764734745025635,
"adv/mean_abs_reasoning": 0.08159875124692917,
"adv/mean_abs_step_conf": 0.07725159823894501,
"adv/ratio_final_to_reasoning": 0.706473402709405,
"adv/ratio_step_to_reasoning": 0.946725250796681,
"adv/std_final_conf": 0.2852080762386322,
"adv/std_reasoning": 0.33123520016670227,
"adv/std_step_conf": 0.33098888397216797,
"calib/answer_extract_rate": 0.046875,
"calib/auroc": 0.41666666666666663,
"calib/avg_num_step_conf": 0.0625,
"calib/ece": 0.6962499999999999,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.875,
"calib/gap": 0.018333333333333424,
"calib/mean_conf": 0.94625,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9416666666666665,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.6962499999999999,
"calib/std_conf": 0.0567753247458788,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 649.25390625,
"completions/mean_terminated_length": 707.2723388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.064,
"grad_norm": 0.136693075299263,
"learning_rate": 3.88888888888889e-06,
"loss": -0.4638,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0004379056626930833,
"mask/share_reasoning": 0.9162116050720215,
"mask/share_step_conf": 0.0013192400801926851,
"num_tokens": 16403896.0,
"reward": 0.021914906799793243,
"reward_std": 0.05458424985408783,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.005612500011920929,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.009773565456271172,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.06954145431518555,
"adv/mean_abs_reasoning": 0.0892697125673294,
"adv/mean_abs_step_conf": 0.06976863741874695,
"adv/ratio_final_to_reasoning": 0.7790039008217449,
"adv/ratio_step_to_reasoning": 0.7815488076779202,
"adv/std_final_conf": 0.28522348403930664,
"adv/std_reasoning": 0.33124879002571106,
"adv/std_step_conf": 0.28665006160736084,
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.7,
"calib/avg_num_step_conf": 0.12890625,
"calib/ece": 0.46125,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.875,
"calib/gap": 0.20333333333333325,
"calib/mean_conf": 0.8362499999999999,
"calib/mu_c": 0.9633333333333333,
"calib/mu_w": 0.76,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.06640625,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.46125,
"calib/std_conf": 0.3165808546011587,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 600.72265625,
"completions/mean_terminated_length": 657.2008666992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.038836970925331116,
"learning_rate": 3.861111111111112e-06,
"loss": -0.3511,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.00039035986992530525,
"mask/share_reasoning": 0.9121092557907104,
"mask/share_step_conf": 0.0015629110857844353,
"num_tokens": 16661745.0,
"reward": 0.029278146103024483,
"reward_std": 0.06075853109359741,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.00908671785145998,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.011044181883335114,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.03828040510416031,
"adv/mean_abs_reasoning": 0.03864803910255432,
"adv/mean_abs_step_conf": 0.0386255644261837,
"adv/ratio_final_to_reasoning": 0.9904876416260479,
"adv/ratio_step_to_reasoning": 0.9994184782231517,
"adv/std_final_conf": 0.23195363581180573,
"adv/std_reasoning": 0.23417921364307404,
"adv/std_step_conf": 0.23404304683208466,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.06640625,
"calib/ece": 0.9480000000000001,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.8,
"calib/mean_conf": 0.9480000000000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.9480000000000001,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.9480000000000001,
"calib/std_conf": 0.023999999999999973,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 687.5703125,
"completions/mean_terminated_length": 730.3651733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.039971452206373215,
"learning_rate": 3.833333333333334e-06,
"loss": -0.2889,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.00042069313349202275,
"mask/share_reasoning": 0.9400171041488647,
"mask/share_step_conf": 0.0009684442775323987,
"num_tokens": 16944843.0,
"reward": 0.006424275226891041,
"reward_std": 0.018170595169067383,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0010484375525265932,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.004511831793934107,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.09742030501365662,
"adv/mean_abs_reasoning": 0.11756224930286407,
"adv/mean_abs_step_conf": 0.09357871860265732,
"adv/ratio_final_to_reasoning": 0.8286699649875042,
"adv/ratio_step_to_reasoning": 0.7959929242386274,
"adv/std_final_conf": 0.32832229137420654,
"adv/std_reasoning": 0.3703429102897644,
"adv/std_step_conf": 0.33102813363075256,
"calib/answer_extract_rate": 0.0546875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.16796875,
"calib/ece": 0.45999999999999996,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.96,
"calib/mu_c": 0.96,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.45999999999999996,
"calib/std_conf": 0.004082482904638634,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 640.94140625,
"completions/mean_terminated_length": 689.416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0672,
"grad_norm": 0.06894773989915848,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.5592,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0006614683079533279,
"mask/share_reasoning": 0.926390528678894,
"mask/share_step_conf": 0.002635482233017683,
"num_tokens": 17217564.0,
"reward": 0.061490196734666824,
"reward_std": 0.0757787898182869,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.021030467003583908,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.026527659967541695,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.07636594772338867,
"adv/mean_abs_reasoning": 0.07729607820510864,
"adv/mean_abs_step_conf": 0.07724502682685852,
"adv/ratio_final_to_reasoning": 0.9879666536347184,
"adv/ratio_step_to_reasoning": 0.999339534690044,
"adv/std_final_conf": 0.327199786901474,
"adv/std_reasoning": 0.3311794102191925,
"adv/std_step_conf": 0.3309606909751892,
"calib/answer_extract_rate": 0.046875,
"calib/avg_num_step_conf": 0.09765625,
"calib/ece": 0.93,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.8,
"calib/mean_conf": 0.93,
"calib/mu_c": NaN,
"calib/mu_w": 0.93,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.93,
"calib/std_conf": 0.06511528238439879,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2954.0,
"completions/max_terminated_length": 2954.0,
"completions/mean_length": 587.94140625,
"completions/mean_terminated_length": 637.7669677734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.09593202918767929,
"learning_rate": 3.777777777777778e-06,
"loss": -0.561,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.00035268941428512335,
"mask/share_reasoning": 0.9200137853622437,
"mask/share_step_conf": 0.0015085700433701277,
"num_tokens": 17471853.0,
"reward": 0.012422004714608192,
"reward_std": 0.03513474017381668,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0022496094461530447,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.008094205521047115,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.20421713590621948,
"adv/mean_abs_reasoning": 0.22086596488952637,
"adv/mean_abs_step_conf": 0.22406552731990814,
"adv/ratio_final_to_reasoning": 0.9246202148364762,
"adv/ratio_step_to_reasoning": 1.0144864439932253,
"adv/std_final_conf": 0.5198385119438171,
"adv/std_reasoning": 0.5237278342247009,
"adv/std_step_conf": 0.5231757164001465,
"calib/answer_extract_rate": 0.0859375,
"calib/auroc": 0.5882352941176471,
"calib/avg_num_step_conf": 0.3125,
"calib/ece": 0.7566000000000002,
"calib/final_conf_rate": 0.078125,
"calib/format_rate": 0.0625,
"calib/frac_conf_gt_0.9": 0.85,
"calib/gap": 0.05105882352941182,
"calib/mean_conf": 0.9066000000000001,
"calib/mu_c": 0.9500000000000001,
"calib/mu_w": 0.8989411764705882,
"calib/nonempty_final_conf_rate": 0.078125,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.078125,
"calib/pce": 0.7566000000000002,
"calib/std_conf": 0.14892159010700898,
"calib/step_conf_rate": 0.078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 584.03125,
"completions/mean_terminated_length": 615.2756958007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.10870470106601715,
"learning_rate": 3.7500000000000005e-06,
"loss": -1.1016,
"mask/has_final_conf_rate": 0.078125,
"mask/share_final_conf": 0.0013415388530120254,
"mask/share_reasoning": 0.9434852600097656,
"mask/share_step_conf": 0.0043919761665165424,
"num_tokens": 17726389.0,
"reward": 0.06128288805484772,
"reward_std": 0.13415977358818054,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.017960937693715096,
"rewards/format_reward_step": 0.0625,
"rewards/stepwise_brier_reward": 0.02764780819416046,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.10111625492572784,
"adv/mean_abs_reasoning": 0.1084204912185669,
"adv/mean_abs_step_conf": 0.10549107193946838,
"adv/ratio_final_to_reasoning": 0.93263048146393,
"adv/ratio_step_to_reasoning": 0.9729809444121311,
"adv/std_final_conf": 0.3663294315338135,
"adv/std_reasoning": 0.37030887603759766,
"adv/std_step_conf": 0.3700118362903595,
"calib/answer_extract_rate": 0.07421875,
"calib/auroc": 0.4545454545454546,
"calib/avg_num_step_conf": 0.171875,
"calib/ece": 0.7338461538461541,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.9230769230769231,
"calib/gap": 0.08545454545454534,
"calib/mean_conf": 0.8876923076923079,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8745454545454546,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.7338461538461541,
"calib/std_conf": 0.2562797103573133,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2277.0,
"completions/max_terminated_length": 2277.0,
"completions/mean_length": 519.078125,
"completions/mean_terminated_length": 570.3175659179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.0704,
"grad_norm": 0.05091705545783043,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.6861,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0012055350234732032,
"mask/share_reasoning": 0.9070212841033936,
"mask/share_step_conf": 0.0019294099183753133,
"num_tokens": 17965625.0,
"reward": 0.029108598828315735,
"reward_std": 0.05871621146798134,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.005662109702825546,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.012417396530508995,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.15815921127796173,
"adv/mean_abs_reasoning": 0.1605871617794037,
"adv/mean_abs_step_conf": 0.16047969460487366,
"adv/ratio_final_to_reasoning": 0.9848807932431286,
"adv/ratio_step_to_reasoning": 0.9993307860146525,
"adv/std_final_conf": 0.4613058865070343,
"adv/std_reasoning": 0.4684103727340698,
"adv/std_step_conf": 0.4680980443954468,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.42500000000000004,
"calib/avg_num_step_conf": 0.15625,
"calib/ece": 0.7958666666666669,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.015040000000000164,
"calib/mean_conf": 0.9625333333333336,
"calib/mu_c": 0.95,
"calib/mu_w": 0.9650400000000001,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.7958666666666669,
"calib/std_conf": 0.016449788921307013,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2982.0,
"completions/max_terminated_length": 2982.0,
"completions/mean_length": 598.203125,
"completions/mean_terminated_length": 651.6595458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.09449908882379532,
"learning_rate": 3.694444444444445e-06,
"loss": -0.9013,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0004775570996571332,
"mask/share_reasoning": 0.9159537553787231,
"mask/share_step_conf": 0.0015373954083770514,
"num_tokens": 18223773.0,
"reward": 0.03755014389753342,
"reward_std": 0.10067607462406158,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.009698078036308289,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.01946999505162239,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.2543319761753082,
"adv/mean_abs_reasoning": 0.2732967436313629,
"adv/mean_abs_step_conf": 0.2662087380886078,
"adv/ratio_final_to_reasoning": 0.9306074152071298,
"adv/ratio_step_to_reasoning": 0.9740648005952248,
"adv/std_final_conf": 0.5624877214431763,
"adv/std_reasoning": 0.5737119317054749,
"adv/std_step_conf": 0.57329261302948,
"calib/answer_extract_rate": 0.1328125,
"calib/auroc": 0.28985507246376807,
"calib/avg_num_step_conf": 0.40625,
"calib/ece": 0.8156538461538463,
"calib/final_conf_rate": 0.1015625,
"calib/format_rate": 0.078125,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": -0.2426231884057971,
"calib/mean_conf": 0.8579615384615384,
"calib/mu_c": 0.6433333333333333,
"calib/mu_w": 0.8859565217391304,
"calib/nonempty_final_conf_rate": 0.1015625,
"calib/nonempty_reasoning_rate": 0.1484375,
"calib/nonempty_step_conf_rate": 0.09765625,
"calib/pce": 0.7791153846153848,
"calib/std_conf": 0.2582456315468235,
"calib/step_conf_rate": 0.09765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 600.4140625,
"completions/mean_terminated_length": 656.86328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.15409648418426514,
"learning_rate": 3.6666666666666666e-06,
"loss": -1.4648,
"mask/has_final_conf_rate": 0.1015625,
"mask/share_final_conf": 0.002301223110407591,
"mask/share_reasoning": 0.9055416584014893,
"mask/share_step_conf": 0.006219647824764252,
"num_tokens": 18481567.0,
"reward": 0.07864746451377869,
"reward_std": 0.15497872233390808,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.017530042678117752,
"rewards/format_reward_step": 0.078125,
"rewards/stepwise_brier_reward": 0.03915491700172424,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.07668646425008774,
"adv/mean_abs_reasoning": 0.07989902049303055,
"adv/mean_abs_step_conf": 0.08253868669271469,
"adv/ratio_final_to_reasoning": 0.9597922950354186,
"adv/ratio_step_to_reasoning": 1.0330375289133162,
"adv/std_final_conf": 0.32882875204086304,
"adv/std_reasoning": 0.33124199509620667,
"adv/std_step_conf": 0.33112096786499023,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.06640625,
"calib/ece": 0.6283333333333332,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0050000000000000044,
"calib/mean_conf": 0.9616666666666666,
"calib/mu_c": 0.965,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.6283333333333332,
"calib/std_conf": 0.0037267799624996524,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3065.0,
"completions/max_terminated_length": 3065.0,
"completions/mean_length": 669.1171875,
"completions/mean_terminated_length": 716.7113037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0736,
"grad_norm": 0.07533657550811768,
"learning_rate": 3.638888888888889e-06,
"loss": -0.607,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0002714511356316507,
"mask/share_reasoning": 0.9325519800186157,
"mask/share_step_conf": 0.000770289683714509,
"num_tokens": 18757357.0,
"reward": 0.027300164103507996,
"reward_std": 0.06720856577157974,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.008721483871340752,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.015083336271345615,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.1340787559747696,
"adv/mean_abs_reasoning": 0.15459944307804108,
"adv/mean_abs_step_conf": 0.1544829159975052,
"adv/ratio_final_to_reasoning": 0.8672654526128355,
"adv/ratio_step_to_reasoning": 0.9992462645516965,
"adv/std_final_conf": 0.4342644512653351,
"adv/std_reasoning": 0.46838048100471497,
"adv/std_step_conf": 0.4680275321006775,
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.5555555555555556,
"calib/avg_num_step_conf": 0.15625,
"calib/ece": 0.8126184,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.05264622222222204,
"calib/mean_conf": 0.9126183999999998,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9073537777777779,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.8126184,
"calib/std_conf": 0.13736768239669767,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2973.0,
"completions/max_terminated_length": 2973.0,
"completions/mean_length": 617.078125,
"completions/mean_terminated_length": 660.970703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.16354045271873474,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.8991,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0004902217769995332,
"mask/share_reasoning": 0.9311144351959229,
"mask/share_step_conf": 0.0019891324918717146,
"num_tokens": 19022321.0,
"reward": 0.03013056144118309,
"reward_std": 0.08522209525108337,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00877256877720356,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.016812337562441826,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.18683239817619324,
"adv/mean_abs_reasoning": 0.188472718000412,
"adv/mean_abs_step_conf": 0.19015845656394958,
"adv/ratio_final_to_reasoning": 0.9912967784323291,
"adv/ratio_step_to_reasoning": 1.0089442046648571,
"adv/std_final_conf": 0.49154990911483765,
"adv/std_reasoning": 0.49680474400520325,
"adv/std_step_conf": 0.49653515219688416,
"calib/answer_extract_rate": 0.07421875,
"calib/auroc": 0.2142857142857143,
"calib/avg_num_step_conf": 0.21484375,
"calib/ece": 0.7658,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.12592857142857117,
"calib/mean_conf": 0.8324666666666668,
"calib/mu_c": 0.95,
"calib/mu_w": 0.8240714285714288,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.0859375,
"calib/nonempty_step_conf_rate": 0.0625,
"calib/pce": 0.7658,
"calib/std_conf": 0.29652180732995376,
"calib/step_conf_rate": 0.0625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3052.0,
"completions/max_terminated_length": 3052.0,
"completions/mean_length": 595.44921875,
"completions/mean_terminated_length": 648.6595458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.11147620528936386,
"learning_rate": 3.5833333333333335e-06,
"loss": -1.0824,
"mask/has_final_conf_rate": 0.05859375,
"mask/share_final_conf": 0.0007515978068113327,
"mask/share_reasoning": 0.9147189855575562,
"mask/share_step_conf": 0.0024981689639389515,
"num_tokens": 19279164.0,
"reward": 0.043333299458026886,
"reward_std": 0.10258567333221436,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.01187230832874775,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.026042941957712173,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.21666905283927917,
"adv/mean_abs_reasoning": 0.23212051391601562,
"adv/mean_abs_step_conf": 0.23053902387619019,
"adv/ratio_final_to_reasoning": 0.9334334530970106,
"adv/ratio_step_to_reasoning": 0.9931867717628885,
"adv/std_final_conf": 0.5434837341308594,
"adv/std_reasoning": 0.5492792129516602,
"adv/std_step_conf": 0.5488502383232117,
"calib/answer_extract_rate": 0.1015625,
"calib/auroc": 0.703125,
"calib/avg_num_step_conf": 0.3125,
"calib/ece": 0.7449999999999998,
"calib/final_conf_rate": 0.078125,
"calib/format_rate": 0.0625,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.02499999999999991,
"calib/mean_conf": 0.9449999999999997,
"calib/mu_c": 0.9649999999999999,
"calib/mu_w": 0.94,
"calib/nonempty_final_conf_rate": 0.078125,
"calib/nonempty_reasoning_rate": 0.11328125,
"calib/nonempty_step_conf_rate": 0.078125,
"calib/pce": 0.7449999999999998,
"calib/std_conf": 0.05843800133474792,
"calib/step_conf_rate": 0.078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 536.8046875,
"completions/mean_terminated_length": 577.4033813476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0768,
"grad_norm": 0.10440513491630554,
"learning_rate": 3.555555555555556e-06,
"loss": -1.3869,
"mask/has_final_conf_rate": 0.078125,
"mask/share_final_conf": 0.001047117868438363,
"mask/share_reasoning": 0.9244768619537354,
"mask/share_step_conf": 0.004163539037108421,
"num_tokens": 19520994.0,
"reward": 0.06722670793533325,
"reward_std": 0.1429571509361267,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.015968751162290573,
"rewards/format_reward_step": 0.0625,
"rewards/stepwise_brier_reward": 0.03271903842687607,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.06860332190990448,
"adv/mean_abs_reasoning": 0.09225049614906311,
"adv/mean_abs_step_conf": 0.072090283036232,
"adv/ratio_final_to_reasoning": 0.7436634465255525,
"adv/ratio_step_to_reasoning": 0.7814622798314796,
"adv/std_final_conf": 0.2840113639831543,
"adv/std_reasoning": 0.33122873306274414,
"adv/std_step_conf": 0.2866162061691284,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.5833333333333333,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.8088228571428571,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.009706666666666752,
"calib/mean_conf": 0.9516799999999999,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9502933333333332,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.8088228571428571,
"calib/std_conf": 0.020379754659956027,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2962.0,
"completions/max_terminated_length": 2962.0,
"completions/mean_length": 542.765625,
"completions/mean_terminated_length": 588.7626953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.08159588277339935,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.5564,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0004280888824723661,
"mask/share_reasoning": 0.9201780557632446,
"mask/share_step_conf": 0.001268864842131734,
"num_tokens": 19766974.0,
"reward": 0.02091187797486782,
"reward_std": 0.041246477514505386,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0022610502783209085,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.009443232789635658,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.12885695695877075,
"adv/mean_abs_reasoning": 0.133488729596138,
"adv/mean_abs_step_conf": 0.12991607189178467,
"adv/ratio_final_to_reasoning": 0.965302144597672,
"adv/ratio_step_to_reasoning": 0.9732362596066185,
"adv/std_final_conf": 0.4013332724571228,
"adv/std_reasoning": 0.4056570529937744,
"adv/std_step_conf": 0.40540289878845215,
"calib/answer_extract_rate": 0.0703125,
"calib/auroc": 0.5454545454545454,
"calib/avg_num_step_conf": 0.15234375,
"calib/ece": 0.8733333333333333,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0036363636363636598,
"calib/mean_conf": 0.9566666666666667,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9563636363636363,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.08203125,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.8733333333333333,
"calib/std_conf": 0.010274023338281613,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2922.0,
"completions/max_terminated_length": 2922.0,
"completions/mean_length": 643.421875,
"completions/mean_terminated_length": 683.4689331054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.09621241688728333,
"learning_rate": 3.5e-06,
"loss": -0.984,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0008580433786846697,
"mask/share_reasoning": 0.9374440312385559,
"mask/share_step_conf": 0.0031041507609188557,
"num_tokens": 20035618.0,
"reward": 0.036248065531253815,
"reward_std": 0.07528281211853027,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.006729297339916229,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.022256478667259216,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.1569603532552719,
"adv/mean_abs_reasoning": 0.18250501155853271,
"adv/mean_abs_step_conf": 0.1614481508731842,
"adv/ratio_final_to_reasoning": 0.860033113145125,
"adv/ratio_step_to_reasoning": 0.8846231097681655,
"adv/std_final_conf": 0.465222030878067,
"adv/std_reasoning": 0.49683383107185364,
"adv/std_step_conf": 0.4681553244590759,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.6481481481481481,
"calib/avg_num_step_conf": 0.20703125,
"calib/ece": 0.6158333333333333,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.13888888888888906,
"calib/mean_conf": 0.8658333333333333,
"calib/mu_c": 0.9700000000000001,
"calib/mu_w": 0.831111111111111,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.0625,
"calib/pce": 0.6158333333333333,
"calib/std_conf": 0.22343747571872435,
"calib/step_conf_rate": 0.0625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 576.109375,
"completions/mean_terminated_length": 611.9668579101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.08,
"grad_norm": 0.07777323573827744,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.8569,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0005481348489411175,
"mask/share_reasoning": 0.9385069608688354,
"mask/share_step_conf": 0.0023511142935603857,
"num_tokens": 20287854.0,
"reward": 0.04728742688894272,
"reward_std": 0.11980638653039932,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.016207030043005943,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.023971345275640488,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.16909301280975342,
"adv/mean_abs_reasoning": 0.1740565448999405,
"adv/mean_abs_step_conf": 0.17162185907363892,
"adv/ratio_final_to_reasoning": 0.9714832206221234,
"adv/ratio_step_to_reasoning": 0.9860120983804361,
"adv/std_final_conf": 0.4644908905029297,
"adv/std_reasoning": 0.4684191942214966,
"adv/std_step_conf": 0.4681659936904907,
"calib/answer_extract_rate": 0.0859375,
"calib/auroc": 0.7833333333333333,
"calib/avg_num_step_conf": 0.20703125,
"calib/ece": 0.6052941176470588,
"calib/final_conf_rate": 0.06640625,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.8235294117647058,
"calib/gap": 0.0858333333333331,
"calib/mean_conf": 0.8994117647058824,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8741666666666669,
"calib/nonempty_final_conf_rate": 0.06640625,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.06640625,
"calib/pce": 0.6052941176470588,
"calib/std_conf": 0.17962282667186732,
"calib/step_conf_rate": 0.06640625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 599.14453125,
"completions/mean_terminated_length": 644.4580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.14026468992233276,
"learning_rate": 3.444444444444445e-06,
"loss": -0.8141,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.0010099567007273436,
"mask/share_reasoning": 0.924968957901001,
"mask/share_step_conf": 0.003708635224029422,
"num_tokens": 20544291.0,
"reward": 0.07288633286952972,
"reward_std": 0.1086694523692131,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.024304687976837158,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.035964079201221466,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.12345867604017258,
"adv/mean_abs_reasoning": 0.1375705897808075,
"adv/mean_abs_step_conf": 0.1388530731201172,
"adv/ratio_final_to_reasoning": 0.8974205623228078,
"adv/ratio_step_to_reasoning": 1.0093223656404546,
"adv/std_final_conf": 0.4033997654914856,
"adv/std_reasoning": 0.4056600034236908,
"adv/std_step_conf": 0.4053976833820343,
"calib/answer_extract_rate": 0.06640625,
"calib/auroc": 0.6818181818181819,
"calib/avg_num_step_conf": 0.26171875,
"calib/ece": 0.7975000000000001,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": 0.08636363636363642,
"calib/mean_conf": 0.8808333333333334,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8736363636363635,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.078125,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.7975000000000001,
"calib/std_conf": 0.18918502113598268,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 581.2734375,
"completions/mean_terminated_length": 617.4523315429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.11724759638309479,
"learning_rate": 3.416666666666667e-06,
"loss": -0.796,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.00063959916587919,
"mask/share_reasoning": 0.9372695684432983,
"mask/share_step_conf": 0.0034970752894878387,
"num_tokens": 20797761.0,
"reward": 0.04430808871984482,
"reward_std": 0.07722712308168411,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.012422265484929085,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.02771754562854767,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.18359941244125366,
"adv/mean_abs_reasoning": 0.188472718000412,
"adv/mean_abs_step_conf": 0.18686653673648834,
"adv/ratio_final_to_reasoning": 0.9741431777985625,
"adv/ratio_step_to_reasoning": 0.9914779110686983,
"adv/std_final_conf": 0.4926977753639221,
"adv/std_reasoning": 0.49680477380752563,
"adv/std_step_conf": 0.49635177850723267,
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.5833333333333334,
"calib/avg_num_step_conf": 0.25390625,
"calib/ece": 0.8392307692307694,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.047499999999999876,
"calib/mean_conf": 0.9161538461538463,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9125000000000001,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.0625,
"calib/pce": 0.8392307692307694,
"calib/std_conf": 0.12219394807741323,
"calib/step_conf_rate": 0.0625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3005.0,
"completions/max_terminated_length": 3005.0,
"completions/mean_length": 650.0625,
"completions/mean_terminated_length": 693.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0832,
"grad_norm": 0.1859666258096695,
"learning_rate": 3.3888888888888893e-06,
"loss": -1.4194,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0006620531203225255,
"mask/share_reasoning": 0.9336406588554382,
"mask/share_step_conf": 0.003197286743670702,
"num_tokens": 21072201.0,
"reward": 0.040519773960113525,
"reward_std": 0.09668624401092529,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.010687890462577343,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.02100810408592224,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.13988946378231049,
"adv/mean_abs_reasoning": 0.14555124938488007,
"adv/mean_abs_step_conf": 0.1389458030462265,
"adv/ratio_final_to_reasoning": 0.9611010855180078,
"adv/ratio_step_to_reasoning": 0.9546177283495051,
"adv/std_final_conf": 0.4337327182292938,
"adv/std_reasoning": 0.4381541311740875,
"adv/std_step_conf": 0.4375132620334625,
"calib/answer_extract_rate": 0.07421875,
"calib/auroc": 0.6875,
"calib/avg_num_step_conf": 0.10546875,
"calib/ece": 0.660609090909091,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.041245833333333315,
"calib/mean_conf": 0.9333363636363636,
"calib/mu_c": 0.9633333333333333,
"calib/mu_w": 0.9220875,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.078125,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.660609090909091,
"calib/std_conf": 0.08464065510065343,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2999.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 602.8046875,
"completions/mean_terminated_length": 651.1307983398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.13712556660175323,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.9517,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.000740687595680356,
"mask/share_reasoning": 0.9237175583839417,
"mask/share_step_conf": 0.0013229991309344769,
"num_tokens": 21332895.0,
"reward": 0.038617219775915146,
"reward_std": 0.08786404132843018,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.011576997116208076,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.01675843819975853,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.17335572838783264,
"adv/mean_abs_reasoning": 0.22115306556224823,
"adv/mean_abs_step_conf": 0.20430010557174683,
"adv/ratio_final_to_reasoning": 0.7838721473161673,
"adv/ratio_step_to_reasoning": 0.9237950423718735,
"adv/std_final_conf": 0.4925078749656677,
"adv/std_reasoning": 0.5492574572563171,
"adv/std_step_conf": 0.5232893824577332,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.6309523809523809,
"calib/avg_num_step_conf": 0.19140625,
"calib/ece": 0.7464705882352941,
"calib/final_conf_rate": 0.06640625,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.8823529411764706,
"calib/gap": 0.061190476190475906,
"calib/mean_conf": 0.9229411764705882,
"calib/mu_c": 0.9733333333333333,
"calib/mu_w": 0.9121428571428574,
"calib/nonempty_final_conf_rate": 0.06640625,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.7464705882352941,
"calib/std_conf": 0.13714589508910713,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 586.4296875,
"completions/mean_terminated_length": 612.7591552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.1025858148932457,
"learning_rate": 3.3333333333333333e-06,
"loss": -1.0038,
"mask/has_final_conf_rate": 0.06640625,
"mask/share_final_conf": 0.0011797649785876274,
"mask/share_reasoning": 0.9526797533035278,
"mask/share_step_conf": 0.003171744290739298,
"num_tokens": 21585181.0,
"reward": 0.05320657789707184,
"reward_std": 0.13630428910255432,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.014562109485268593,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.028819594532251358,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.07615146785974503,
"adv/mean_abs_reasoning": 0.09662556648254395,
"adv/mean_abs_step_conf": 0.07724253088235855,
"adv/ratio_final_to_reasoning": 0.788108889105476,
"adv/ratio_step_to_reasoning": 0.7994005488838498,
"adv/std_final_conf": 0.32627594470977783,
"adv/std_reasoning": 0.3702907860279083,
"adv/std_step_conf": 0.330949991941452,
"calib/answer_extract_rate": 0.0390625,
"calib/auroc": 0.9,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.7966666666666666,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.008000000000000007,
"calib/mean_conf": 0.9633333333333333,
"calib/mu_c": 0.97,
"calib/mu_w": 0.962,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.7966666666666666,
"calib/std_conf": 0.004714045207910321,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2901.0,
"completions/max_terminated_length": 2901.0,
"completions/mean_length": 640.1875,
"completions/mean_terminated_length": 674.4362182617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0864,
"grad_norm": 0.04565536603331566,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.3859,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.00029859773349016905,
"mask/share_reasoning": 0.9474666714668274,
"mask/share_step_conf": 0.0014534371439367533,
"num_tokens": 21855317.0,
"reward": 0.01604899764060974,
"reward_std": 0.04539342224597931,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0011496094521135092,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.00808569323271513,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.1454237550497055,
"adv/mean_abs_reasoning": 0.14722901582717896,
"adv/mean_abs_step_conf": 0.14664122462272644,
"adv/ratio_final_to_reasoning": 0.9877384171365208,
"adv/ratio_step_to_reasoning": 0.9960076401981626,
"adv/std_final_conf": 0.4333024024963379,
"adv/std_reasoning": 0.4381259083747864,
"adv/std_step_conf": 0.4377772808074951,
"calib/answer_extract_rate": 0.0703125,
"calib/avg_num_step_conf": 0.14453125,
"calib/ece": 0.8723784615384617,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/mean_conf": 0.8723784615384618,
"calib/mu_c": NaN,
"calib/mu_w": 0.8723784615384618,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.8723784615384617,
"calib/std_conf": 0.2538613953093348,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2981.0,
"completions/max_terminated_length": 2981.0,
"completions/mean_length": 563.3671875,
"completions/mean_terminated_length": 600.925048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.07771904021501541,
"learning_rate": 3.277777777777778e-06,
"loss": -0.8274,
"mask/has_final_conf_rate": 0.05078125,
"mask/share_final_conf": 0.0006377262761816382,
"mask/share_reasoning": 0.9350779056549072,
"mask/share_step_conf": 0.0017843758687376976,
"num_tokens": 22105091.0,
"reward": 0.026922065764665604,
"reward_std": 0.063670314848423,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0033345031552016735,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.017020627856254578,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.07634367048740387,
"adv/mean_abs_reasoning": 0.07730336487293243,
"adv/mean_abs_step_conf": 0.07725274562835693,
"adv/ratio_final_to_reasoning": 0.9875853478421532,
"adv/ratio_step_to_reasoning": 0.9993451870476957,
"adv/std_final_conf": 0.32711878418922424,
"adv/std_reasoning": 0.3312106430530548,
"adv/std_step_conf": 0.33099377155303955,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.7,
"calib/avg_num_step_conf": 0.16796875,
"calib/ece": 0.7183333333333333,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.1020000000000002,
"calib/mean_conf": 0.8849999999999998,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8679999999999998,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.7183333333333333,
"calib/std_conf": 0.1359840676942217,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2954.0,
"completions/max_terminated_length": 2954.0,
"completions/mean_length": 591.35546875,
"completions/mean_terminated_length": 641.4703369140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.10863806307315826,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.4169,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0002931926283054054,
"mask/share_reasoning": 0.9194286465644836,
"mask/share_step_conf": 0.002153177745640278,
"num_tokens": 22363742.0,
"reward": 0.017213165760040283,
"reward_std": 0.048686183989048004,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.005438281688839197,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.008269689977169037,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.1318395733833313,
"adv/mean_abs_reasoning": 0.1352827101945877,
"adv/mean_abs_step_conf": 0.13516740500926971,
"adv/ratio_final_to_reasoning": 0.9745485819562317,
"adv/ratio_step_to_reasoning": 0.9991476724176198,
"adv/std_final_conf": 0.4273732602596283,
"adv/std_reasoning": 0.4381563663482666,
"adv/std_step_conf": 0.43778303265571594,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.625,
"calib/avg_num_step_conf": 0.11328125,
"calib/ece": 0.7588087960118102,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.007739004985237163,
"calib/mean_conf": 0.9588087960118102,
"calib/mu_c": 0.965,
"calib/mu_w": 0.9572609950147628,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.7588087960118102,
"calib/std_conf": 0.022773163829472608,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 582.21875,
"completions/mean_terminated_length": 605.8861694335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0896,
"grad_norm": 0.13366936147212982,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.7068,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0006747072329744697,
"mask/share_reasoning": 0.9586551189422607,
"mask/share_step_conf": 0.001607641694135964,
"num_tokens": 22618710.0,
"reward": 0.03124072588980198,
"reward_std": 0.08836211264133453,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0091178547590971,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.014953775331377983,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.12445656955242157,
"adv/mean_abs_reasoning": 0.13518846035003662,
"adv/mean_abs_step_conf": 0.13008637726306915,
"adv/ratio_final_to_reasoning": 0.9206153338100936,
"adv/ratio_step_to_reasoning": 0.9622594778152151,
"adv/std_final_conf": 0.4016876816749573,
"adv/std_reasoning": 0.4056515097618103,
"adv/std_step_conf": 0.4053715169429779,
"calib/answer_extract_rate": 0.05859375,
"calib/avg_num_step_conf": 0.15234375,
"calib/ece": 0.88,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.7777777777777778,
"calib/mean_conf": 0.88,
"calib/mu_c": NaN,
"calib/mu_w": 0.88,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.88,
"calib/std_conf": 0.1567021236472421,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 534.28515625,
"completions/mean_terminated_length": 589.5560302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.11454667896032333,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.8361,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0006141673075035214,
"mask/share_reasoning": 0.9030116200447083,
"mask/share_step_conf": 0.0026242309249937534,
"num_tokens": 22863311.0,
"reward": 0.03183240070939064,
"reward_std": 0.0626763254404068,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0070679690688848495,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.01716206967830658,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.10018318891525269,
"adv/mean_abs_reasoning": 0.10592220723628998,
"adv/mean_abs_step_conf": 0.10617843270301819,
"adv/ratio_final_to_reasoning": 0.9458185542882923,
"adv/ratio_step_to_reasoning": 1.002418996671365,
"adv/std_final_conf": 0.36842837929725647,
"adv/std_reasoning": 0.3703352212905884,
"adv/std_step_conf": 0.37000808119773865,
"calib/answer_extract_rate": 0.06640625,
"calib/auroc": 0.7142857142857143,
"calib/avg_num_step_conf": 0.15625,
"calib/ece": 0.7266666666666666,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.7777777777777778,
"calib/gap": 0.020714285714285685,
"calib/mean_conf": 0.9488888888888888,
"calib/mu_c": 0.965,
"calib/mu_w": 0.9442857142857143,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.7266666666666666,
"calib/std_conf": 0.03142696805273543,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2970.0,
"completions/max_terminated_length": 2970.0,
"completions/mean_length": 593.3984375,
"completions/mean_terminated_length": 632.9583740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.14420361816883087,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.7468,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0007306640036404133,
"mask/share_reasoning": 0.9340777397155762,
"mask/share_step_conf": 0.0026915583293884993,
"num_tokens": 23120733.0,
"reward": 0.03548293933272362,
"reward_std": 0.07222367823123932,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.01073281280696392,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.014818215742707253,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.0819677859544754,
"adv/mean_abs_reasoning": 0.08327651768922806,
"adv/mean_abs_step_conf": 0.07941487431526184,
"adv/ratio_final_to_reasoning": 0.9842845045510117,
"adv/ratio_step_to_reasoning": 0.9536286641046023,
"adv/std_final_conf": 0.32665571570396423,
"adv/std_reasoning": 0.33119046688079834,
"adv/std_step_conf": 0.33089470863342285,
"calib/answer_extract_rate": 0.0625,
"calib/avg_num_step_conf": 0.14453125,
"calib/ece": 0.9466666666666667,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.8888888888888888,
"calib/mean_conf": 0.9466666666666667,
"calib/mu_c": NaN,
"calib/mu_w": 0.9466666666666667,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.0703125,
"calib/nonempty_step_conf_rate": 0.03125,
"calib/pce": 0.9466666666666667,
"calib/std_conf": 0.025385910352879647,
"calib/step_conf_rate": 0.03125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2916.0,
"completions/max_terminated_length": 2916.0,
"completions/mean_length": 535.41015625,
"completions/mean_terminated_length": 575.9033813476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0928,
"grad_norm": 0.07893165200948715,
"learning_rate": 3.138888888888889e-06,
"loss": -0.6318,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0013570425799116492,
"mask/share_reasoning": 0.9254021644592285,
"mask/share_step_conf": 0.0029283100739121437,
"num_tokens": 23363294.0,
"reward": 0.013725357130169868,
"reward_std": 0.033733196556568146,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0015304686967283487,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.007154230028390884,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.12596651911735535,
"adv/mean_abs_reasoning": 0.12843841314315796,
"adv/mean_abs_step_conf": 0.12427614629268646,
"adv/ratio_final_to_reasoning": 0.9807542466049668,
"adv/ratio_step_to_reasoning": 0.9675932865518027,
"adv/std_final_conf": 0.4031435549259186,
"adv/std_reasoning": 0.40566515922546387,
"adv/std_step_conf": 0.40525007247924805,
"calib/answer_extract_rate": 0.06640625,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.11328125,
"calib/ece": 0.7727272727272726,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.1060000000000001,
"calib/mean_conf": 0.8636363636363636,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8539999999999999,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.078125,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.7727272727272726,
"calib/std_conf": 0.2735381883684027,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 623.82421875,
"completions/mean_terminated_length": 679.5701904296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.15347856283187866,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.8374,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0007594419876113534,
"mask/share_reasoning": 0.9156370162963867,
"mask/share_step_conf": 0.0015722834505140781,
"num_tokens": 23632841.0,
"reward": 0.0326203815639019,
"reward_std": 0.07294408977031708,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.010305078700184822,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.013213226571679115,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.056992340832948685,
"adv/mean_abs_reasoning": 0.09663102775812149,
"adv/mean_abs_step_conf": 0.057933270931243896,
"adv/ratio_final_to_reasoning": 0.5897933837111515,
"adv/ratio_step_to_reasoning": 0.5995307332988065,
"adv/std_final_conf": 0.28197088837623596,
"adv/std_reasoning": 0.3703117072582245,
"adv/std_step_conf": 0.2866179347038269,
"calib/answer_extract_rate": 0.0390625,
"calib/auroc": 0.7500000000000001,
"calib/avg_num_step_conf": 0.0546875,
"calib/ece": 0.7275992063492063,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.11613425925925924,
"calib/mean_conf": 0.8704563492063492,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8538657407407407,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.7275992063492063,
"calib/std_conf": 0.2331131178592595,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 630.01171875,
"completions/mean_terminated_length": 658.2979125976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.13982361555099487,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.4437,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0010558613575994968,
"mask/share_reasoning": 0.9537944793701172,
"mask/share_step_conf": 0.0021809404715895653,
"num_tokens": 23903012.0,
"reward": 0.017247222363948822,
"reward_std": 0.04878251254558563,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0008829089347273111,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.006709239911288023,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.038533568382263184,
"adv/mean_abs_reasoning": 0.05798481032252312,
"adv/mean_abs_step_conf": 0.03862389177083969,
"adv/ratio_final_to_reasoning": 0.6645459072459109,
"adv/ratio_step_to_reasoning": 0.6661036149985812,
"adv/std_final_conf": 0.23348626494407654,
"adv/std_reasoning": 0.28687289357185364,
"adv/std_step_conf": 0.23403292894363403,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.6875,
"calib/avg_num_step_conf": 0.109375,
"calib/ece": 0.46000000000000013,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.25749999999999995,
"calib/mean_conf": 0.7933333333333334,
"calib/mu_c": 0.965,
"calib/mu_w": 0.7075,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.46000000000000013,
"calib/std_conf": 0.35560589921365976,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2989.0,
"completions/max_terminated_length": 2989.0,
"completions/mean_length": 604.52734375,
"completions/mean_terminated_length": 644.8292236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.096,
"grad_norm": 0.11000526696443558,
"learning_rate": 3.055555555555556e-06,
"loss": -0.2259,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0015883477171882987,
"mask/share_reasoning": 0.9321364164352417,
"mask/share_step_conf": 0.0037752282805740833,
"num_tokens": 24161091.0,
"reward": 0.015415811911225319,
"reward_std": 0.04360250011086464,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.004642187617719173,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.005073030013591051,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.19097426533699036,
"adv/mean_abs_reasoning": 0.1932547688484192,
"adv/mean_abs_step_conf": 0.19305163621902466,
"adv/ratio_final_to_reasoning": 0.9881994968350946,
"adv/ratio_step_to_reasoning": 0.9989488868471139,
"adv/std_final_conf": 0.5175157785415649,
"adv/std_reasoning": 0.5236801505088806,
"adv/std_step_conf": 0.5231298208236694,
"calib/answer_extract_rate": 0.0703125,
"calib/auroc": 0.6875,
"calib/avg_num_step_conf": 0.140625,
"calib/ece": 0.802128427128427,
"calib/final_conf_rate": 0.0546875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.023350168350168343,
"calib/mean_conf": 0.9449855699855699,
"calib/mu_c": 0.965,
"calib/mu_w": 0.9416498316498316,
"calib/nonempty_final_conf_rate": 0.0546875,
"calib/nonempty_reasoning_rate": 0.08203125,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.802128427128427,
"calib/std_conf": 0.048943040131846964,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 616.54296875,
"completions/mean_terminated_length": 646.86474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.1311177760362625,
"learning_rate": 3.0277777777777776e-06,
"loss": -1.1402,
"mask/has_final_conf_rate": 0.0546875,
"mask/share_final_conf": 0.001614823006093502,
"mask/share_reasoning": 0.9477262496948242,
"mask/share_step_conf": 0.003783911233767867,
"num_tokens": 24426638.0,
"reward": 0.0390644446015358,
"reward_std": 0.11049094051122665,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.010537873953580856,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.018172457814216614,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.11584252864122391,
"adv/mean_abs_reasoning": 0.12452749907970428,
"adv/mean_abs_step_conf": 0.12619081139564514,
"adv/ratio_final_to_reasoning": 0.9302566059491685,
"adv/ratio_step_to_reasoning": 1.0133569880406597,
"adv/std_final_conf": 0.4036062955856323,
"adv/std_reasoning": 0.4056704044342041,
"adv/std_step_conf": 0.40536797046661377,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.8928571428571428,
"calib/avg_num_step_conf": 0.15234375,
"calib/ece": 0.5548472222222224,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.23881249999999998,
"calib/mean_conf": 0.7770694444444444,
"calib/mu_c": 0.9628125,
"calib/mu_w": 0.724,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.1015625,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.5548472222222224,
"calib/std_conf": 0.33366705747932857,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 520.86328125,
"completions/mean_terminated_length": 546.4794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.11405057460069656,
"learning_rate": 3e-06,
"loss": -0.8509,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0007968221325427294,
"mask/share_reasoning": 0.949183464050293,
"mask/share_step_conf": 0.0031446926295757294,
"num_tokens": 24666699.0,
"reward": 0.036815397441387177,
"reward_std": 0.08702380955219269,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.016938727349042892,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.01828642562031746,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.09549635648727417,
"adv/mean_abs_reasoning": 0.10031682252883911,
"adv/mean_abs_step_conf": 0.10555162280797958,
"adv/ratio_final_to_reasoning": 0.9519475804750579,
"adv/ratio_step_to_reasoning": 1.0521826763166824,
"adv/std_final_conf": 0.3664848506450653,
"adv/std_reasoning": 0.3703256845474243,
"adv/std_step_conf": 0.37017467617988586,
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.2777777777777778,
"calib/avg_num_step_conf": 0.12890625,
"calib/ece": 0.6905539772727272,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.08265624999999999,
"calib/mean_conf": 0.872372159090909,
"calib/mu_c": 0.94,
"calib/mu_w": 0.85734375,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.6905539772727272,
"calib/std_conf": 0.27654168850000704,
"calib/step_conf_rate": 0.05859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3008.0,
"completions/max_terminated_length": 3008.0,
"completions/mean_length": 682.49609375,
"completions/mean_terminated_length": 710.2398071289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0992,
"grad_norm": 0.08944234251976013,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.4989,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0006196058820933104,
"mask/share_reasoning": 0.9587451219558716,
"mask/share_step_conf": 0.001572765177115798,
"num_tokens": 24947194.0,
"reward": 0.03515958413481712,
"reward_std": 0.07899127155542374,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.009003906510770321,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.0228484645485878,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.1211947426199913,
"adv/mean_abs_reasoning": 0.12562128901481628,
"adv/mean_abs_step_conf": 0.12621799111366272,
"adv/ratio_final_to_reasoning": 0.964762768878268,
"adv/ratio_step_to_reasoning": 1.0047500077695912,
"adv/std_final_conf": 0.4039490818977356,
"adv/std_reasoning": 0.40567031502723694,
"adv/std_step_conf": 0.4053996205329895,
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.6944444444444444,
"calib/avg_num_step_conf": 0.1171875,
"calib/ece": 0.6869960907508896,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.03515625,
"calib/frac_conf_gt_0.9": 0.8181818181818182,
"calib/gap": 0.1175603335266906,
"calib/mean_conf": 0.8688142725690714,
"calib/mu_c": 0.9650000000000001,
"calib/mu_w": 0.8474396664733095,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.6869960907508896,
"calib/std_conf": 0.2026797791513814,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2825.0,
"completions/max_terminated_length": 2825.0,
"completions/mean_length": 542.26171875,
"completions/mean_terminated_length": 571.2716064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.12918737530708313,
"learning_rate": 2.944444444444445e-06,
"loss": -0.9216,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0013458357425406575,
"mask/share_reasoning": 0.9453219175338745,
"mask/share_step_conf": 0.002550976350903511,
"num_tokens": 25194693.0,
"reward": 0.040408436208963394,
"reward_std": 0.08953796327114105,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.015550907701253891,
"rewards/format_reward_step": 0.03515625,
"rewards/stepwise_brier_reward": 0.022260168567299843,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.10610508918762207,
"adv/mean_abs_reasoning": 0.1279323250055313,
"adv/mean_abs_step_conf": 0.1084851622581482,
"adv/ratio_final_to_reasoning": 0.8293845139063524,
"adv/ratio_step_to_reasoning": 0.847988670990367,
"adv/std_final_conf": 0.3683347702026367,
"adv/std_reasoning": 0.4057179093360901,
"adv/std_step_conf": 0.37018951773643494,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.28125,
"calib/avg_num_step_conf": 0.15234375,
"calib/ece": 0.5637790697674417,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": -0.17744186046511623,
"calib/mean_conf": 0.8637790697674419,
"calib/mu_c": 0.7750581395348837,
"calib/mu_w": 0.9524999999999999,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.46377906976744177,
"calib/std_conf": 0.16117473486730388,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2892.0,
"completions/max_terminated_length": 2892.0,
"completions/mean_length": 551.0625,
"completions/mean_terminated_length": 595.240478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.12042226642370224,
"learning_rate": 2.916666666666667e-06,
"loss": -0.678,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.001040589064359665,
"mask/share_reasoning": 0.9197874069213867,
"mask/share_step_conf": 0.004953281953930855,
"num_tokens": 25441893.0,
"reward": 0.04625809192657471,
"reward_std": 0.10904739797115326,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.015411455184221268,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.01840420439839363,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.17711566388607025,
"adv/mean_abs_reasoning": 0.22767430543899536,
"adv/mean_abs_step_conf": 0.196292906999588,
"adv/ratio_final_to_reasoning": 0.7779343547114843,
"adv/ratio_step_to_reasoning": 0.8621653928892038,
"adv/std_final_conf": 0.4906613528728485,
"adv/std_reasoning": 0.5492765307426453,
"adv/std_step_conf": 0.5232849717140198,
"calib/answer_extract_rate": 0.078125,
"calib/auroc": 0.3214285714285714,
"calib/avg_num_step_conf": 0.171875,
"calib/ece": 0.7791931115366915,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.9375,
"calib/gap": 0.04377930110206696,
"calib/mean_conf": 0.9041931115366915,
"calib/mu_c": 0.942500000001,
"calib/mu_w": 0.898720698898933,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.07421875,
"calib/pce": 0.7791931115366915,
"calib/std_conf": 0.22850792172641496,
"calib/step_conf_rate": 0.07421875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 594.328125,
"completions/mean_terminated_length": 626.1234130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1024,
"grad_norm": 0.14036454260349274,
"learning_rate": 2.888888888888889e-06,
"loss": -1.163,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.001665796386078,
"mask/share_reasoning": 0.9442043900489807,
"mask/share_step_conf": 0.0033485221210867167,
"num_tokens": 25699857.0,
"reward": 0.05740179494023323,
"reward_std": 0.13414423167705536,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.01009805966168642,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.027723312377929688,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.9375,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9375,
"calib/mu_c": NaN,
"calib/mu_w": 0.9375,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9375,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2839.0,
"completions/max_terminated_length": 2839.0,
"completions/mean_length": 566.63671875,
"completions/mean_terminated_length": 596.9506225585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.0,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 9.337649680674076e-05,
"mask/share_reasoning": 0.9488027691841125,
"mask/share_step_conf": 0.00032261834712699056,
"num_tokens": 25949988.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.03821493685245514,
"adv/mean_abs_reasoning": 0.07731429487466812,
"adv/mean_abs_step_conf": 0.038617927581071854,
"adv/ratio_final_to_reasoning": 0.49428035157540046,
"adv/ratio_step_to_reasoning": 0.4994927218009841,
"adv/std_final_conf": 0.2315683364868164,
"adv/std_reasoning": 0.3312574625015259,
"adv/std_step_conf": 0.23399679362773895,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.0859375,
"calib/ece": 0.3182323232323234,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.014848484848484889,
"calib/mean_conf": 0.98489898989899,
"calib/mu_c": 0.989848484848485,
"calib/mu_w": 0.9750000000000001,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.3182323232323234,
"calib/std_conf": 0.010848832829285312,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 631.0390625,
"completions/mean_terminated_length": 664.79833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.05403328686952591,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.2989,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0006696260534226894,
"mask/share_reasoning": 0.9450562000274658,
"mask/share_step_conf": 0.0034929136745631695,
"num_tokens": 26217718.0,
"reward": 0.018742192536592484,
"reward_std": 0.05301092937588692,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.004099121317267418,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.004184824880212545,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.038170330226421356,
"adv/mean_abs_reasoning": 0.057977523654699326,
"adv/mean_abs_step_conf": 0.0386352613568306,
"adv/ratio_final_to_reasoning": 0.6583642732613932,
"adv/ratio_step_to_reasoning": 0.6663834348450832,
"adv/std_final_conf": 0.23128490149974823,
"adv/std_reasoning": 0.2868368327617645,
"adv/std_step_conf": 0.234101802110672,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.06640625,
"calib/ece": 0.955204081632653,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.955204081632653,
"calib/mu_c": NaN,
"calib/mu_w": 0.955204081632653,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.955204081632653,
"calib/std_conf": 0.005204081632653068,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2828.0,
"completions/max_terminated_length": 2828.0,
"completions/mean_length": 580.92578125,
"completions/mean_terminated_length": 624.8613891601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1056,
"grad_norm": 0.055600475519895554,
"learning_rate": 2.805555555555556e-06,
"loss": -0.3092,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0002980417921207845,
"mask/share_reasoning": 0.9283925294876099,
"mask/share_step_conf": 0.0009968822123482823,
"num_tokens": 26472235.0,
"reward": 0.010449407622218132,
"reward_std": 0.029555387794971466,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0006840474670752883,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.004931790754199028,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.03767964243888855,
"adv/mean_abs_reasoning": 0.057977523654699326,
"adv/mean_abs_step_conf": 0.03859805315732956,
"adv/ratio_final_to_reasoning": 0.6499008592243394,
"adv/ratio_step_to_reasoning": 0.6657416654635097,
"adv/std_final_conf": 0.2283170223236084,
"adv/std_reasoning": 0.2868368327617645,
"adv/std_step_conf": 0.23387649655342102,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.9740130289711159,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9740130289711159,
"calib/mu_c": NaN,
"calib/mu_w": 0.9740130289711159,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.9740130289711159,
"calib/std_conf": 0.011860412005284073,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 586.30078125,
"completions/mean_terminated_length": 612.6244506835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.05185471847653389,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.2999,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0013168000150471926,
"mask/share_reasoning": 0.9537020921707153,
"mask/share_step_conf": 0.002012330573052168,
"num_tokens": 26729736.0,
"reward": 0.010003788396716118,
"reward_std": 0.028294987976551056,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00036270637065172195,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.004201224073767662,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.4219834728020131,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.4219834728020131,
"calib/mu_c": NaN,
"calib/mu_w": 0.4219834728020131,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.4219834728020131,
"calib/std_conf": 0.4007358049345676,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 577.13671875,
"completions/mean_terminated_length": 613.05810546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.000803845701739192,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0019,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0013398993760347366,
"mask/share_reasoning": 0.9395236968994141,
"mask/share_step_conf": 0.0005426329444162548,
"num_tokens": 26984475.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.05078125,
"calib/ece": 0.9586917106662725,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9586917106662725,
"calib/mu_c": NaN,
"calib/mu_w": 0.9586917106662725,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.0390625,
"calib/pce": 0.9586917106662725,
"calib/std_conf": 0.011406376782531445,
"calib/step_conf_rate": 0.0390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 568.7265625,
"completions/mean_terminated_length": 614.3206176757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.1088,
"grad_norm": 0.0,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0012479554861783981,
"mask/share_reasoning": 0.9224805235862732,
"mask/share_step_conf": 0.002052756492048502,
"num_tokens": 27236765.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.038129813969135284,
"adv/mean_abs_reasoning": 0.03864803910255432,
"adv/mean_abs_step_conf": 0.03862028568983078,
"adv/ratio_final_to_reasoning": 0.9865911661897282,
"adv/ratio_step_to_reasoning": 0.9992818933801558,
"adv/std_final_conf": 0.23103944957256317,
"adv/std_reasoning": 0.23417921364307404,
"adv/std_step_conf": 0.23401108384132385,
"calib/answer_extract_rate": 0.05859375,
"calib/avg_num_step_conf": 0.078125,
"calib/ece": 0.9624061285918764,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9624061285918764,
"calib/mu_c": NaN,
"calib/mu_w": 0.9624061285918764,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.07421875,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.9624061285918764,
"calib/std_conf": 0.0068011903033952095,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 604.109375,
"completions/mean_terminated_length": 623.5967407226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.05884866788983345,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.3036,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0016500115161761642,
"mask/share_reasoning": 0.9648259878158569,
"mask/share_step_conf": 0.0022740354761481285,
"num_tokens": 27495969.0,
"reward": 0.006301497109234333,
"reward_std": 0.017823325470089912,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0006331427721306682,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.004473922774195671,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.01893104799091816,
"adv/mean_abs_reasoning": 0.038653504103422165,
"adv/mean_abs_step_conf": 0.01924740895628929,
"adv/ratio_final_to_reasoning": 0.4897627894295387,
"adv/ratio_step_to_reasoning": 0.4979473246407493,
"adv/std_final_conf": 0.16222229599952698,
"adv/std_reasoning": 0.23421232402324677,
"adv/std_step_conf": 0.1649332195520401,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.70578125,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.03437499999999982,
"calib/mean_conf": 0.95578125,
"calib/mu_c": 0.9300000000000002,
"calib/mu_w": 0.964375,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.70578125,
"calib/std_conf": 0.01732825884754432,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 649.31640625,
"completions/mean_terminated_length": 689.7303466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.054028674960136414,
"learning_rate": 2.666666666666667e-06,
"loss": -0.1354,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0006737759103998542,
"mask/share_reasoning": 0.9399489164352417,
"mask/share_step_conf": 0.0007835605647414923,
"num_tokens": 27768874.0,
"reward": 0.006154080852866173,
"reward_std": 0.017406370490789413,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00020713958656415343,
"rewards/format_reward_step": 0.00390625,
"rewards/stepwise_brier_reward": 0.00048584199976176023,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.03664717078208923,
"adv/mean_abs_reasoning": 0.03864803910255432,
"adv/mean_abs_step_conf": 0.03860872983932495,
"adv/ratio_final_to_reasoning": 0.948228464705397,
"adv/ratio_step_to_reasoning": 0.9989828911338797,
"adv/std_final_conf": 0.22227756679058075,
"adv/std_reasoning": 0.23417921364307404,
"adv/std_step_conf": 0.23394106328487396,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.03125,
"calib/ece": 0.9681625662625422,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9681625662625422,
"calib/mu_c": NaN,
"calib/mu_w": 0.9681625662625422,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.9681625662625422,
"calib/std_conf": 0.026415746987534794,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 580.0,
"completions/mean_terminated_length": 611.02880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.112,
"grad_norm": 0.11139997094869614,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.2799,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.000729624240193516,
"mask/share_reasoning": 0.9466995000839233,
"mask/share_step_conf": 0.0017896032659336925,
"num_tokens": 28023114.0,
"reward": 0.005710248835384846,
"reward_std": 0.016151022166013718,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00048408948350697756,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.00336595275439322,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.9900000000000001,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9900000000000001,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2904.0,
"completions/max_terminated_length": 2904.0,
"completions/mean_length": 649.3984375,
"completions/mean_terminated_length": 678.5550537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.0,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0001707650226308033,
"mask/share_reasoning": 0.9564396142959595,
"mask/share_step_conf": 0.0004209116450510919,
"num_tokens": 28293944.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.018899209797382355,
"adv/mean_abs_reasoning": 0.01932401955127716,
"adv/mean_abs_step_conf": 0.019282689318060875,
"adv/ratio_final_to_reasoning": 0.978016491198037,
"adv/ratio_step_to_reasoning": 0.9978611989546681,
"adv/std_final_conf": 0.16194945573806763,
"adv/std_reasoning": 0.16558970510959625,
"adv/std_step_conf": 0.16523553431034088,
"calib/answer_extract_rate": 0.03515625,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.9751562500000001,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9751562500000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.9751562500000001,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.9751562500000001,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 603.23828125,
"completions/mean_terminated_length": 638.1363525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.02593773417174816,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.1541,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00016640975081827492,
"mask/share_reasoning": 0.944463849067688,
"mask/share_step_conf": 0.0006822873838245869,
"num_tokens": 28552989.0,
"reward": 0.0024086986668407917,
"reward_std": 0.006812828592956066,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00019168081053067,
"rewards/format_reward_step": 0.00390625,
"rewards/stepwise_brier_reward": 0.0008153068483807147,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.612890625,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.612890625,
"calib/mu_c": NaN,
"calib/mu_w": 0.612890625,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.612890625,
"calib/std_conf": 0.31289062500000003,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2936.0,
"completions/max_terminated_length": 2936.0,
"completions/mean_length": 627.7890625,
"completions/mean_terminated_length": 658.6638793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1152,
"grad_norm": 0.0,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.001333074877038598,
"mask/share_reasoning": 0.951507568359375,
"mask/share_step_conf": 0.000284366135019809,
"num_tokens": 28816935.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.02734375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 550.63671875,
"completions/mean_terminated_length": 582.49169921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.0007484604720957577,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0232,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9448825120925903,
"mask/share_step_conf": 0.0004299637221265584,
"num_tokens": 29062498.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.9771875,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9771875,
"calib/mu_c": NaN,
"calib/mu_w": 0.9771875,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.9771875,
"calib/std_conf": 0.007187500000000013,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2826.0,
"completions/max_terminated_length": 2826.0,
"completions/mean_length": 583.64453125,
"completions/mean_terminated_length": 612.3483276367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.0,
"learning_rate": 2.5e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0005077360547147691,
"mask/share_reasoning": 0.9521244764328003,
"mask/share_step_conf": 0.0004928014823235571,
"num_tokens": 29316831.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.038073450326919556,
"adv/mean_abs_reasoning": 0.03864803910255432,
"adv/mean_abs_step_conf": 0.038613706827163696,
"adv/ratio_final_to_reasoning": 0.9851327832154675,
"adv/ratio_step_to_reasoning": 0.9991116683747001,
"adv/std_final_conf": 0.23070098459720612,
"adv/std_reasoning": 0.23417921364307404,
"adv/std_step_conf": 0.23397119343280792,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.0390625,
"calib/ece": 0.9636030726162881,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9636030726162881,
"calib/mu_c": NaN,
"calib/mu_w": 0.9636030726162881,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.9636030726162881,
"calib/std_conf": 0.01469339308189049,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 629.72265625,
"completions/mean_terminated_length": 655.3211059570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1184,
"grad_norm": 0.03367357701063156,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.2922,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0008814925095066428,
"mask/share_reasoning": 0.9582992792129517,
"mask/share_step_conf": 0.0017567335162311792,
"num_tokens": 29585448.0,
"reward": 0.005740383639931679,
"reward_std": 0.016236256808042526,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0006421275902539492,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.003347203601151705,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 691.02734375,
"completions/mean_terminated_length": 731.0040893554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.0,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.943239688873291,
"mask/share_step_conf": 0.0020727741066366434,
"num_tokens": 29870271.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.018980545923113823,
"adv/mean_abs_reasoning": 0.01932401955127716,
"adv/mean_abs_step_conf": 0.01931268535554409,
"adv/ratio_final_to_reasoning": 0.9822255598918271,
"adv/ratio_step_to_reasoning": 0.9994134659353353,
"adv/std_final_conf": 0.16264642775058746,
"adv/std_reasoning": 0.16558970510959625,
"adv/std_step_conf": 0.16549257934093475,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.9697435897435898,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9697435897435898,
"calib/mu_c": NaN,
"calib/mu_w": 0.9697435897435898,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.9697435897435898,
"calib/std_conf": 0.0003626188621468834,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 615.72265625,
"completions/mean_terminated_length": 643.3673095703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.02239268273115158,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.1121,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0005311921122483909,
"mask/share_reasoning": 0.9556958675384521,
"mask/share_step_conf": 0.0008042484987527132,
"num_tokens": 30133096.0,
"reward": 0.00297063821926713,
"reward_std": 0.008402233012020588,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00023668639187235385,
"rewards/format_reward_step": 0.00390625,
"rewards/stepwise_brier_reward": 0.0019166830461472273,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.02734375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 632.4140625,
"completions/mean_terminated_length": 663.516357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1216,
"grad_norm": 0.0,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9525699615478516,
"mask/share_step_conf": 0.0005550722125917673,
"num_tokens": 30400018.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.05078125,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.9500000000000001,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9500000000000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.9500000000000001,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9500000000000001,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 637.1953125,
"completions/mean_terminated_length": 691.1949462890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.0,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.001953125,
"mask/share_reasoning": 0.9198802709579468,
"mask/share_step_conf": 4.1567218431737274e-05,
"num_tokens": 30668404.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0390625,
"calib/avg_num_step_conf": 0.02734375,
"calib/ece": 0.9651347030478711,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9651347030478711,
"calib/mu_c": NaN,
"calib/mu_w": 0.9651347030478711,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0546875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.9651347030478711,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2935.0,
"completions/max_terminated_length": 2935.0,
"completions/mean_length": 646.5546875,
"completions/mean_terminated_length": 681.1439819335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.0,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00024069320352282375,
"mask/share_reasoning": 0.9482711553573608,
"mask/share_step_conf": 0.0007068718550726771,
"num_tokens": 30938442.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.03125,
"calib/ece": 0.9606285696295394,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9606285696295394,
"calib/mu_c": NaN,
"calib/mu_w": 0.9606285696295394,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.9606285696295394,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 643.56640625,
"completions/mean_terminated_length": 686.4708862304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1248,
"grad_norm": 0.0,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00019975141913164407,
"mask/share_reasoning": 0.9363988041877747,
"mask/share_step_conf": 0.0009014662355184555,
"num_tokens": 31209795.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2917.0,
"completions/max_terminated_length": 2917.0,
"completions/mean_length": 579.546875,
"completions/mean_terminated_length": 620.7698364257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.0005588608328253031,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0076,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9332329034805298,
"mask/share_step_conf": 0.00036084410385228693,
"num_tokens": 31462167.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 628.5234375,
"completions/mean_terminated_length": 659.4343872070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.0,
"learning_rate": 2.25e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0003443797177169472,
"mask/share_reasoning": 0.9520903825759888,
"mask/share_step_conf": 0.0006902526365593076,
"num_tokens": 31728133.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 563.96484375,
"completions/mean_terminated_length": 604.0794677734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.128,
"grad_norm": 0.0,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9334514737129211,
"mask/share_step_conf": 0.0001422833011019975,
"num_tokens": 31979196.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 626.875,
"completions/mean_terminated_length": 660.4114990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.0,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9491629600524902,
"mask/share_step_conf": 5.580885408562608e-05,
"num_tokens": 32244732.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.9659777386006246,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9659777386006246,
"calib/mu_c": NaN,
"calib/mu_w": 0.9659777386006246,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.9659777386006246,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 575.4375,
"completions/mean_terminated_length": 629.5385131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.0,
"learning_rate": 2.166666666666667e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0003627232217695564,
"mask/share_reasoning": 0.9131672382354736,
"mask/share_step_conf": 0.0005325586535036564,
"num_tokens": 32499388.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2886.0,
"completions/max_terminated_length": 2886.0,
"completions/mean_length": 637.34765625,
"completions/mean_terminated_length": 668.6925659179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1312,
"grad_norm": 0.0,
"learning_rate": 2.138888888888889e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9520647525787354,
"mask/share_step_conf": 0.00106026791036129,
"num_tokens": 32767837.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 561.5703125,
"completions/mean_terminated_length": 601.5146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.0,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9335083961486816,
"mask/share_step_conf": 8.535483357263729e-05,
"num_tokens": 33018415.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 659.74609375,
"completions/mean_terminated_length": 703.7291870117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.0,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9373856782913208,
"mask/share_step_conf": 0.00011434726184234023,
"num_tokens": 33292118.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2852.0,
"completions/max_terminated_length": 2852.0,
"completions/mean_length": 635.34375,
"completions/mean_terminated_length": 674.8880004882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.1344,
"grad_norm": 0.0,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9414001703262329,
"mask/share_step_conf": 6.056201527826488e-06,
"num_tokens": 33560230.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 663.05078125,
"completions/mean_terminated_length": 719.2415161132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.0006136018200777471,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0093,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9217934012413025,
"mask/share_step_conf": 8.160474681062624e-05,
"num_tokens": 33833643.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2895.0,
"completions/max_terminated_length": 2895.0,
"completions/mean_length": 627.9453125,
"completions/mean_terminated_length": 661.5390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.94921875,
"mask/share_step_conf": 0.0,
"num_tokens": 34101061.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.02734375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2926.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 692.4921875,
"completions/mean_terminated_length": 723.5836181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1376,
"grad_norm": 0.0,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9545925855636597,
"mask/share_step_conf": 0.0024386178702116013,
"num_tokens": 34380723.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 536.00390625,
"completions/mean_terminated_length": 581.427978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.0,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9218621253967285,
"mask/share_step_conf": 1.2865376447734889e-05,
"num_tokens": 34623228.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 613.83984375,
"completions/mean_terminated_length": 652.045654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.0,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9411472082138062,
"mask/share_step_conf": 0.0002590351505205035,
"num_tokens": 34886579.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 660.9765625,
"completions/mean_terminated_length": 705.0416870117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1408,
"grad_norm": 0.0,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9372943639755249,
"mask/share_step_conf": 0.00020565465092658997,
"num_tokens": 35161381.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 708.54296875,
"completions/mean_terminated_length": 740.3550415039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.0,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.956986129283905,
"mask/share_step_conf": 4.512997475103475e-05,
"num_tokens": 35449112.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0078125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 702.57421875,
"completions/mean_terminated_length": 762.1143798828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.0,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9218196272850037,
"mask/share_step_conf": 5.537808465305716e-05,
"num_tokens": 35737923.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 672.32421875,
"completions/mean_terminated_length": 708.2921752929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.144,
"grad_norm": 0.0005483909044414759,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0099,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9491338729858398,
"mask/share_step_conf": 8.486342267133296e-05,
"num_tokens": 36015918.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.975,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.975,
"calib/mu_c": NaN,
"calib/mu_w": 0.975,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.975,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 612.83203125,
"completions/mean_terminated_length": 653.6875610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.0,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 5.321866410668008e-05,
"mask/share_reasoning": 0.9371010661125183,
"mask/share_step_conf": 0.0003457071434240788,
"num_tokens": 36281291.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0078125,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 668.94140625,
"completions/mean_terminated_length": 701.8401489257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.0,
"learning_rate": 1.75e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9511955976486206,
"mask/share_step_conf": 0.001929364399984479,
"num_tokens": 36559524.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 593.125,
"completions/mean_terminated_length": 624.85595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1472,
"grad_norm": 0.0,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.948685884475708,
"mask/share_step_conf": 0.000532844103872776,
"num_tokens": 36815700.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 599.9375,
"completions/mean_terminated_length": 639.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.0,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9371291399002075,
"mask/share_step_conf": 0.0003708239528350532,
"num_tokens": 37072380.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 653.01953125,
"completions/mean_terminated_length": 685.1351928710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9528490900993347,
"mask/share_step_conf": 0.0002759067574515939,
"num_tokens": 37344569.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3064.0,
"completions/max_terminated_length": 3064.0,
"completions/mean_length": 711.59765625,
"completions/mean_terminated_length": 755.8880004882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1504,
"grad_norm": 0.0,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9413928985595703,
"mask/share_step_conf": 1.3360410775931086e-05,
"num_tokens": 37633834.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2970.0,
"completions/max_terminated_length": 2970.0,
"completions/mean_length": 685.59765625,
"completions/mean_terminated_length": 731.30419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.0,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9340871572494507,
"mask/share_step_conf": 0.003412847174331546,
"num_tokens": 37914507.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 656.6953125,
"completions/mean_terminated_length": 691.8271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.0,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9482780694961548,
"mask/share_step_conf": 0.0009406713652424514,
"num_tokens": 38189957.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.9940721006120699,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9940721006120699,
"calib/mu_c": NaN,
"calib/mu_w": 0.9940721006120699,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9940721006120699,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 656.30859375,
"completions/mean_terminated_length": 700.0625610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1536,
"grad_norm": 0.001189550501294434,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0069,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0003620766510721296,
"mask/share_reasoning": 0.93581223487854,
"mask/share_step_conf": 0.0013256651582196355,
"num_tokens": 38462100.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.02734375,
"calib/ece": 0.96,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.96,
"calib/mu_c": NaN,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.96,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2975.0,
"completions/max_terminated_length": 2975.0,
"completions/mean_length": 635.6953125,
"completions/mean_terminated_length": 669.7036743164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.0,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 8.281802001874894e-05,
"mask/share_reasoning": 0.9487344026565552,
"mask/share_step_conf": 0.00040147791150957346,
"num_tokens": 38727542.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 636.84375,
"completions/mean_terminated_length": 679.300048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.0,
"learning_rate": 1.5e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9373124837875366,
"mask/share_step_conf": 0.0001875000016298145,
"num_tokens": 38997790.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.0,
"calib/ece": 0.9807894736842104,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9807894736842104,
"calib/mu_c": NaN,
"calib/mu_w": 0.9807894736842104,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0078125,
"calib/nonempty_step_conf_rate": 0.0,
"calib/pce": 0.9807894736842104,
"calib/std_conf": 0.019210526315789234,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 636.046875,
"completions/mean_terminated_length": 659.2227172851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1568,
"grad_norm": 0.0,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00044946392881684005,
"mask/share_reasoning": 0.9642276763916016,
"mask/share_step_conf": 0.00016664052964188159,
"num_tokens": 39264298.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2599.0,
"completions/max_terminated_length": 2599.0,
"completions/mean_length": 586.65234375,
"completions/mean_terminated_length": 620.5908813476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.0,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9447327852249146,
"mask/share_step_conf": 0.0005797140765935183,
"num_tokens": 39519593.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.03865896910429001,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.2342454344034195,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3032.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 691.40234375,
"completions/mean_terminated_length": 737.495849609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.0006965881329961121,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0201,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9373958110809326,
"mask/share_step_conf": 0.00010416324221296236,
"num_tokens": 39801048.0,
"reward": 0.0078125,
"reward_std": 0.022097086533904076,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 619.40234375,
"completions/mean_terminated_length": 655.2355346679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16,
"grad_norm": 0.0,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9431923031806946,
"mask/share_step_conf": 0.002120216842740774,
"num_tokens": 40064575.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.94,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.94,
"calib/mu_c": NaN,
"calib/mu_w": 0.94,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.94,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2867.0,
"completions/max_terminated_length": 2867.0,
"completions/mean_length": 592.2265625,
"completions/mean_terminated_length": 639.70458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.0,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 4.515895852819085e-05,
"mask/share_reasoning": 0.9251866340637207,
"mask/share_step_conf": 0.0005494383512996137,
"num_tokens": 40323209.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.9842106360885977,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9842106360885977,
"calib/mu_c": NaN,
"calib/mu_w": 0.9842106360885977,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.9842106360885977,
"calib/std_conf": 0.011988413866375502,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 578.18359375,
"completions/mean_terminated_length": 619.3096313476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.0,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0008371968287974596,
"mask/share_reasoning": 0.9285774827003479,
"mask/share_step_conf": 0.004179063253104687,
"num_tokens": 40576616.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 654.390625,
"completions/mean_terminated_length": 678.23486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1632,
"grad_norm": 0.0,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9647525548934937,
"mask/share_step_conf": 9.118674643104896e-05,
"num_tokens": 40851460.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03515625,
"calib/avg_num_step_conf": 0.015625,
"calib/ece": 0.8750000000000002,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.8750000000000002,
"calib/mu_c": NaN,
"calib/mu_w": 0.8750000000000002,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.8750000000000002,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 655.62109375,
"completions/mean_terminated_length": 674.0521850585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.0,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.000894201803021133,
"mask/share_reasoning": 0.9707907438278198,
"mask/share_step_conf": 0.0009712825412862003,
"num_tokens": 41123739.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0078125,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 560.046875,
"completions/mean_terminated_length": 587.5901489257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.0,
"learning_rate": 1.25e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.953125,
"mask/share_step_conf": 0.0,
"num_tokens": 41374327.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3064.0,
"completions/max_terminated_length": 3064.0,
"completions/mean_length": 657.609375,
"completions/mean_terminated_length": 704.3849487304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1664,
"grad_norm": 0.0,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.931401789188385,
"mask/share_step_conf": 0.00219197035767138,
"num_tokens": 41647435.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.015625,
"calib/ece": 0.9226582608695653,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9226582608695653,
"calib/mu_c": NaN,
"calib/mu_w": 0.9226582608695653,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.9226582608695653,
"calib/std_conf": 0.002658260869565221,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2972.0,
"completions/max_terminated_length": 2972.0,
"completions/mean_length": 774.03515625,
"completions/mean_terminated_length": 839.63134765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.0,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0017013853648677468,
"mask/share_reasoning": 0.918377697467804,
"mask/share_step_conf": 0.0017959036631509662,
"num_tokens": 41949316.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.03515625,
"calib/ece": 0.960828827622024,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.960828827622024,
"calib/mu_c": NaN,
"calib/mu_w": 0.960828827622024,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.05859375,
"calib/nonempty_step_conf_rate": 0.03515625,
"calib/pce": 0.960828827622024,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.03515625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2955.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 692.8515625,
"completions/mean_terminated_length": 751.5678100585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00036790166632272303,
"mask/share_reasoning": 0.9196397066116333,
"mask/share_step_conf": 0.00186736264731735,
"num_tokens": 42231926.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.9999999999999998,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9999999999999998,
"calib/mu_c": NaN,
"calib/mu_w": 0.9999999999999998,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9999999999999998,
"calib/std_conf": 2.220446049250313e-16,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2957.0,
"completions/max_terminated_length": 2957.0,
"completions/mean_length": 576.9140625,
"completions/mean_terminated_length": 615.3750610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1696,
"grad_norm": 0.0,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0006256421329453588,
"mask/share_reasoning": 0.9368565082550049,
"mask/share_step_conf": 1.7836757251643576e-05,
"num_tokens": 42484400.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.02734375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 656.1875,
"completions/mean_terminated_length": 688.458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.0,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9526559114456177,
"mask/share_step_conf": 0.0004691215290222317,
"num_tokens": 42757224.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0390625,
"calib/avg_num_step_conf": 0.02734375,
"calib/ece": 0.8,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.8,
"calib/mu_c": NaN,
"calib/mu_w": 0.8,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.8,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 688.9296875,
"completions/mean_terminated_length": 728.7850952148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.00032964005367830396,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0086,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00021701389050576836,
"mask/share_reasoning": 0.9441901445388794,
"mask/share_step_conf": 0.000905350549146533,
"num_tokens": 43037510.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2744.0,
"completions/max_terminated_length": 2744.0,
"completions/mean_length": 574.69921875,
"completions/mean_terminated_length": 605.4443969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1728,
"grad_norm": 0.0,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9491317272186279,
"mask/share_step_conf": 8.699888712726533e-05,
"num_tokens": 43288777.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2926.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 560.73828125,
"completions/mean_terminated_length": 610.8468017578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.0,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.917188823223114,
"mask/share_step_conf": 0.000779931026045233,
"num_tokens": 43537158.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 613.52734375,
"completions/mean_terminated_length": 649.0206298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9443371295928955,
"mask/share_step_conf": 0.0009753695921972394,
"num_tokens": 43800357.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.96,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.96,
"calib/mu_c": NaN,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.96,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 606.296875,
"completions/mean_terminated_length": 654.9028930664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.176,
"grad_norm": 0.0,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 2.861721623048652e-05,
"mask/share_reasoning": 0.9257179498672485,
"mask/share_step_conf": 3.46535089192912e-05,
"num_tokens": 44061145.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.02734375,
"calib/ece": 0.9682673675233835,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9682673675233835,
"calib/mu_c": NaN,
"calib/mu_w": 0.9682673675233835,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.9682673675233835,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2894.0,
"completions/max_terminated_length": 2894.0,
"completions/mean_length": 637.0234375,
"completions/mean_terminated_length": 679.49169921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.0005020391545258462,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0082,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0030303029343485832,
"mask/share_reasoning": 0.9339636564254761,
"mask/share_step_conf": 0.0005060465191490948,
"num_tokens": 44330407.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 3000.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 612.921875,
"completions/mean_terminated_length": 667.693603515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.0,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.917394757270813,
"mask/share_step_conf": 0.0005739557673223317,
"num_tokens": 44592923.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 622.71875,
"completions/mean_terminated_length": 656.0328979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1792,
"grad_norm": 0.0005375173641368747,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0123,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9491442441940308,
"mask/share_step_conf": 7.449730765074492e-05,
"num_tokens": 44857011.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2974.0,
"completions/max_terminated_length": 2974.0,
"completions/mean_length": 688.1953125,
"completions/mean_terminated_length": 719.0938720703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.0,
"learning_rate": 8.611111111111112e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9561343193054199,
"mask/share_step_conf": 0.0008969124755822122,
"num_tokens": 45137373.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 634.4140625,
"completions/mean_terminated_length": 673.9004516601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.0,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.941338062286377,
"mask/share_step_conf": 6.818037945777178e-05,
"num_tokens": 45403935.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2730.0,
"completions/max_terminated_length": 2730.0,
"completions/mean_length": 555.0390625,
"completions/mean_terminated_length": 592.0416870117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1824,
"grad_norm": 0.0,
"learning_rate": 8.055555555555557e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9375,
"mask/share_step_conf": 0.0,
"num_tokens": 45652921.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3028.0,
"completions/max_terminated_length": 3028.0,
"completions/mean_length": 579.78125,
"completions/mean_terminated_length": 613.322265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.0,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9434047937393188,
"mask/share_step_conf": 0.0019076891476288438,
"num_tokens": 45904697.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 657.28125,
"completions/mean_terminated_length": 692.4443969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.0,
"learning_rate": 7.5e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.948871374130249,
"mask/share_step_conf": 0.0003474223776720464,
"num_tokens": 46176121.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01171875,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 678.28125,
"completions/mean_terminated_length": 723.5000610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1856,
"grad_norm": 0.0,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9375,
"mask/share_step_conf": 0.0,
"num_tokens": 46453993.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.02734375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2730.0,
"completions/max_terminated_length": 2730.0,
"completions/mean_length": 576.91796875,
"completions/mean_terminated_length": 615.3792114257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.0007179116364568472,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0247,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9356477856636047,
"mask/share_step_conf": 0.0018521937308833003,
"num_tokens": 46707508.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 661.20703125,
"completions/mean_terminated_length": 702.3610229492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9411953687667847,
"mask/share_step_conf": 0.00021085733897052705,
"num_tokens": 46980841.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2769.0,
"completions/max_terminated_length": 2769.0,
"completions/mean_length": 613.859375,
"completions/mean_terminated_length": 644.0491333007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1888,
"grad_norm": 0.0,
"learning_rate": 6.388888888888889e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0004226059827487916,
"mask/share_reasoning": 0.952595591545105,
"mask/share_step_conf": 0.00010681642743293196,
"num_tokens": 47241821.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2830.0,
"completions/max_terminated_length": 2830.0,
"completions/mean_length": 615.19140625,
"completions/mean_terminated_length": 650.7809448242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.001339837210252881,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0007,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.944941520690918,
"mask/share_step_conf": 0.000371001660823822,
"num_tokens": 47505382.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.018928933888673782,
"adv/mean_abs_reasoning": 0.01932401955127716,
"adv/mean_abs_step_conf": 0.01928865909576416,
"adv/ratio_final_to_reasoning": 0.9795546852167584,
"adv/ratio_step_to_reasoning": 0.9981701293864266,
"adv/std_final_conf": 0.16220416128635406,
"adv/std_reasoning": 0.16558970510959625,
"adv/std_step_conf": 0.16528668999671936,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.9391350031946583,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9391350031946583,
"calib/mu_c": NaN,
"calib/mu_w": 0.9391350031946583,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.9391350031946583,
"calib/std_conf": 0.03413500319465823,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3057.0,
"completions/max_terminated_length": 3057.0,
"completions/mean_length": 636.2421875,
"completions/mean_terminated_length": 667.5327758789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.06453783065080643,
"learning_rate": 5.833333333333334e-07,
"loss": -0.1462,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.001012221910059452,
"mask/share_reasoning": 0.9516113996505737,
"mask/share_step_conf": 0.0005013375193811953,
"num_tokens": 47774524.0,
"reward": 0.0024650082923471928,
"reward_std": 0.006972096394747496,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00020603708981070668,
"rewards/format_reward_step": 0.00390625,
"rewards/stepwise_brier_reward": 0.000920747930649668,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 647.69921875,
"completions/mean_terminated_length": 688.0125122070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.192,
"grad_norm": 0.0,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.938878059387207,
"mask/share_step_conf": 0.002528225537389517,
"num_tokens": 48044191.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 603.8125,
"completions/mean_terminated_length": 663.4163208007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.0012316078646108508,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0265,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9101225137710571,
"mask/share_step_conf": 3.375771484570578e-05,
"num_tokens": 48305031.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2782.0,
"completions/max_terminated_length": 2782.0,
"completions/mean_length": 574.73828125,
"completions/mean_terminated_length": 607.987548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.0,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9448782205581665,
"mask/share_step_conf": 0.0004343033069744706,
"num_tokens": 48558324.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.015625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 578.08984375,
"completions/mean_terminated_length": 611.5330200195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1952,
"grad_norm": 0.0,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9451446533203125,
"mask/share_step_conf": 0.0001678102562436834,
"num_tokens": 48812995.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 593.359375,
"completions/mean_terminated_length": 638.2353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9294921159744263,
"mask/share_step_conf": 0.0001953368482645601,
"num_tokens": 49070175.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 627.8359375,
"completions/mean_terminated_length": 666.9129028320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.0,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9410565495491028,
"mask/share_step_conf": 0.000349695939803496,
"num_tokens": 49337821.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 635.94140625,
"completions/mean_terminated_length": 675.5228271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1984,
"grad_norm": 0.0,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9408471584320068,
"mask/share_step_conf": 0.0005590926157310605,
"num_tokens": 49605662.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 722.9453125,
"completions/mean_terminated_length": 774.3681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.000874101126100868,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0042,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9335595369338989,
"mask/share_step_conf": 3.423374073463492e-05,
"num_tokens": 49892280.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03515625,
"calib/avg_num_step_conf": 0.0390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 698.91796875,
"completions/mean_terminated_length": 739.3511962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9445304870605469,
"mask/share_step_conf": 0.0007820092723704875,
"num_tokens": 50175275.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 602.7265625,
"completions/mean_terminated_length": 622.1693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.2016,
"grad_norm": 0.0,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9686535596847534,
"mask/share_step_conf": 9.648424020269886e-05,
"num_tokens": 50437341.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.9928518292682926,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9928518292682926,
"calib/mu_c": NaN,
"calib/mu_w": 0.9928518292682926,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9928518292682926,
"calib/std_conf": 0.007148170731707315,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 573.69140625,
"completions/mean_terminated_length": 614.4978637695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.0,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00041248349589295685,
"mask/share_reasoning": 0.9331349730491638,
"mask/share_step_conf": 4.62801763205789e-05,
"num_tokens": 50689814.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.01899779960513115,
"adv/mean_abs_reasoning": 0.038653504103422165,
"adv/mean_abs_step_conf": 0.019316211342811584,
"adv/ratio_final_to_reasoning": 0.49148971214356707,
"adv/ratio_step_to_reasoning": 0.49972730263027915,
"adv/std_final_conf": 0.162794291973114,
"adv/std_reasoning": 0.23421232402324677,
"adv/std_step_conf": 0.16552278399467468,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.9688,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9688,
"calib/mu_c": NaN,
"calib/mu_w": 0.9688,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.9688,
"calib/std_conf": 0.0011999999999999789,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 630.11328125,
"completions/mean_terminated_length": 661.1024169921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.0887182429432869,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.1527,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0001581465476192534,
"mask/share_reasoning": 0.9523420333862305,
"mask/share_step_conf": 0.0006248306017369032,
"num_tokens": 50955291.0,
"reward": 0.007060363423079252,
"reward_std": 0.019969724118709564,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00024902436416596174,
"rewards/format_reward_step": 0.00390625,
"rewards/stepwise_brier_reward": 0.0022774646058678627,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.00390625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0078125,
"calib/nonempty_step_conf_rate": 0.00390625,
"calib/step_conf_rate": 0.00390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2686.0,
"completions/max_terminated_length": 2686.0,
"completions/mean_length": 590.32421875,
"completions/mean_terminated_length": 627.06640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.2048,
"grad_norm": 0.0,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9413959980010986,
"mask/share_step_conf": 1.0268796359014232e-05,
"num_tokens": 51211390.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.02734375,
"calib/ece": 0.9600000000000001,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9600000000000001,
"calib/mu_c": NaN,
"calib/mu_w": 0.9600000000000001,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.9600000000000001,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3013.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 637.46484375,
"completions/mean_terminated_length": 663.3780517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.0,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00012885199976153672,
"mask/share_reasoning": 0.960355281829834,
"mask/share_step_conf": 0.0004534159670583904,
"num_tokens": 51480293.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.96,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.96,
"calib/mu_c": NaN,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.96,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 656.734375,
"completions/mean_terminated_length": 709.3839111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.0008238269947469234,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0312,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00026483050896786153,
"mask/share_reasoning": 0.9254249930381775,
"mask/share_step_conf": 9.144169598584995e-05,
"num_tokens": 51754361.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019329484552145004,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16563653945922852,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0078125,
"calib/ece": 0.9601074217378754,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.9601074217378754,
"calib/mu_c": NaN,
"calib/mu_w": 0.9601074217378754,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.0078125,
"calib/pce": 0.9601074217378754,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3014.0,
"completions/max_terminated_length": 3014.0,
"completions/mean_length": 696.2890625,
"completions/mean_terminated_length": 742.7083740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.208,
"grad_norm": 0.000848722702357918,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0067,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0006241945666261017,
"mask/share_reasoning": 0.9368205070495605,
"mask/share_step_conf": 5.5262891692109406e-05,
"num_tokens": 52038595.0,
"reward": 0.00390625,
"reward_std": 0.011048543266952038,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.03125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.046875,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3043.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 692.03125,
"completions/mean_terminated_length": 726.0655517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.0,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9523943662643433,
"mask/share_step_conf": 0.0007306202314794064,
"num_tokens": 52318299.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.0234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 619.29296875,
"completions/mean_terminated_length": 666.1303100585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.0,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9287298321723938,
"mask/share_step_conf": 0.0009576534503139555,
"num_tokens": 52581894.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.01171875,
"calib/ece": 0.97,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.97,
"calib/mu_c": NaN,
"calib/mu_w": 0.97,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.97,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 646.1484375,
"completions/mean_terminated_length": 683.5288696289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.2112,
"grad_norm": 0.0,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00013048943947069347,
"mask/share_reasoning": 0.9449419379234314,
"mask/share_step_conf": 0.00024005374871194363,
"num_tokens": 52852692.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.01953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 635.00390625,
"completions/mean_terminated_length": 683.0294189453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.0,
"learning_rate": 2.777777777777778e-08,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9284061789512634,
"mask/share_step_conf": 0.0012813331559300423,
"num_tokens": 53119453.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3007.0,
"completions/max_terminated_length": 3007.0,
"completions/mean_length": 669.09375,
"completions/mean_terminated_length": 713.7000122070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9339677095413208,
"mask/share_step_conf": 0.003532242262735963,
"num_tokens": 53398789.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/stepwise_brier_reward": 0.0,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.40800720210187136,
"train_runtime": 11790.426,
"train_samples_per_second": 4.343,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 53398789,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}