Files
PureRL-1.5B-v7-s2-margin-ma…/trainer_state.json
ModelHub XC 6c78080a7b 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-margin-maskoff
Source: Original Platform
2026-06-04 16:45:36 +08:00

9843 lines
384 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.0070291250012815,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0516,
"num_tokens": 229171.0,
"reward": 0.5306904315948486,
"reward_std": 0.15138749778270721,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.016632115468382835,
"step": 1
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.007271615322679281,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0125,
"num_tokens": 458661.0,
"reward": 0.47535353899002075,
"reward_std": 0.15537551045417786,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.012380896136164665,
"step": 2
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.49794238683127573,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.24564705882352944,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": -0.0007188371166866325,
"calib/mean_conf": 0.8809411764705882,
"calib/mu_c": 0.8806790123456791,
"calib/mu_w": 0.8813978494623658,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24564705882352944,
"calib/std_conf": 0.04406653248090241,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7875328083989501,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.028252365594522044,
"calib/step_q_w": 0.7592804428044281,
"calib/step_q_w_n": 542.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1591.0,
"completions/max_terminated_length": 1591.0,
"completions/mean_length": 502.5078125,
"completions/mean_terminated_length": 504.47845458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.007185729686170816,
"kl": 0.0012685656547546387,
"learning_rate": 7.5e-07,
"loss": 0.0124,
"num_tokens": 692559.0,
"reward": 0.5189927220344543,
"reward_std": 0.13065262138843536,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7026242017745972,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.01036119181662798,
"step": 3
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4632880529432254,
"calib/avg_num_step_conf": 5.19921875,
"calib/ece": 0.22222222222222235,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.30158730158730157,
"calib/gap": 0.0034691745036575794,
"calib/mean_conf": 0.876984126984127,
"calib/mu_c": 0.8781818181818184,
"calib/mu_w": 0.8747126436781608,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22222222222222235,
"calib/std_conf": 0.05825191380832978,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7934677419354839,
"calib/step_q_c_n": 868.0,
"calib/step_q_gap": 0.014720441719501154,
"calib/step_q_w": 0.7787473002159827,
"calib/step_q_w_n": 463.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2456.0,
"completions/max_terminated_length": 2456.0,
"completions/mean_length": 523.35546875,
"completions/mean_terminated_length": 525.4078979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.007038692943751812,
"kl": 0.0002740919589996338,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.046,
"num_tokens": 932706.0,
"reward": 0.5242385268211365,
"reward_std": 0.1526789367198944,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7104023694992065,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.013855919241905212,
"step": 4
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.4222260040844112,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.3433333333333333,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.2674897119341564,
"calib/gap": -0.012205582028590789,
"calib/mean_conf": 0.8779835390946501,
"calib/mu_c": 0.8723076923076923,
"calib/mu_w": 0.8845132743362831,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.34316872427983536,
"calib/std_conf": 0.04670685809638801,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7886736214605068,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.007429679618826568,
"calib/step_q_w": 0.7812439418416802,
"calib/step_q_w_n": 619.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2678.0,
"completions/max_terminated_length": 2678.0,
"completions/mean_length": 538.12109375,
"completions/mean_terminated_length": 542.3582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.0068595572374761105,
"kl": 0.00029274821281433105,
"learning_rate": 1.25e-06,
"loss": 0.0803,
"num_tokens": 1177153.0,
"reward": 0.44816750288009644,
"reward_std": 0.13739368319511414,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5918129086494446,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.014678382314741611,
"step": 5
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4842447587774691,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.2972156862745098,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.25882352941176473,
"calib/gap": -0.002398964384945579,
"calib/mean_conf": 0.8758039215686274,
"calib/mu_c": 0.8747972972972973,
"calib/mu_w": 0.8771962616822429,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2963137254901961,
"calib/std_conf": 0.04259292936251419,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7968199233716474,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.0017872436330853558,
"calib/step_q_w": 0.7950326797385621,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 451.4921875,
"completions/mean_terminated_length": 451.4921875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.0064,
"grad_norm": 0.008030211552977562,
"kl": 0.00039631128311157227,
"learning_rate": 1.5e-06,
"loss": 0.0086,
"num_tokens": 1398687.0,
"reward": 0.4995495676994324,
"reward_std": 0.11786103248596191,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6636097431182861,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.02064560167491436,
"step": 6
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.45714188914487064,
"calib/avg_num_step_conf": 5.5078125,
"calib/ece": 0.25796812749004,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3466135458167331,
"calib/gap": -0.006704160455346164,
"calib/mean_conf": 0.8834661354581673,
"calib/mu_c": 0.8809554140127389,
"calib/mu_w": 0.8876595744680851,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25796812749004,
"calib/std_conf": 0.04749017353386353,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8038183934807916,
"calib/step_q_c_n": 859.0,
"calib/step_q_gap": 0.02223944611237061,
"calib/step_q_w": 0.781578947368421,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2374.0,
"completions/max_terminated_length": 2374.0,
"completions/mean_length": 542.875,
"completions/mean_terminated_length": 545.0039672851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.006746398750692606,
"kl": 0.005849212408065796,
"learning_rate": 1.75e-06,
"loss": 0.0181,
"num_tokens": 1645087.0,
"reward": 0.5083991289138794,
"reward_std": 0.14332298934459686,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6798004508018494,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.01902913488447666,
"step": 7
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5029393173198482,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.31596837944664025,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2766798418972332,
"calib/gap": 0.01349873577749694,
"calib/mean_conf": 0.8693280632411067,
"calib/mu_c": 0.875357142857143,
"calib/mu_w": 0.861858407079646,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31596837944664025,
"calib/std_conf": 0.08297782385661961,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7995081967213116,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.023086792039037296,
"calib/step_q_w": 0.7764214046822743,
"calib/step_q_w_n": 598.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2356.0,
"completions/max_terminated_length": 2356.0,
"completions/mean_length": 528.59765625,
"completions/mean_terminated_length": 528.59765625,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.006836770102381706,
"kl": 0.0005608052015304565,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0212,
"num_tokens": 1886920.0,
"reward": 0.4802365303039551,
"reward_std": 0.1404484510421753,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6440644264221191,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.01015863474458456,
"step": 8
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.45059007053716765,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.26429718875502006,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.24497991967871485,
"calib/gap": -0.004903011394465584,
"calib/mean_conf": 0.8747389558232932,
"calib/mu_c": 0.872828947368421,
"calib/mu_w": 0.8777319587628866,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.26429718875502006,
"calib/std_conf": 0.04688737390643053,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7938685015290521,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.06206422018348634,
"calib/step_q_w": 0.7318042813455657,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 504.71484375,
"completions/mean_terminated_length": 508.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.0096,
"grad_norm": 0.007662550546228886,
"kl": 0.00041669607162475586,
"learning_rate": 2.25e-06,
"loss": 0.0244,
"num_tokens": 2123663.0,
"reward": 0.4904530942440033,
"reward_std": 0.15297287702560425,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6537765860557556,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.016973400488495827,
"step": 9
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5013053998981152,
"calib/avg_num_step_conf": 5.03515625,
"calib/ece": 0.2852549019607844,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.23529411764705882,
"calib/gap": 0.0006628884360672105,
"calib/mean_conf": 0.8774117647058823,
"calib/mu_c": 0.877682119205298,
"calib/mu_w": 0.8770192307692308,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2852549019607844,
"calib/std_conf": 0.04070418221543076,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.789935373645681,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 1.3802463411094479e-06,
"calib/step_q_w": 0.7899339933993399,
"calib/step_q_w_n": 606.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2195.0,
"completions/max_terminated_length": 2195.0,
"completions/mean_length": 510.5390625,
"completions/mean_terminated_length": 510.5390625,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.007049976848065853,
"kl": 0.00035446882247924805,
"learning_rate": 2.5e-06,
"loss": 0.0477,
"num_tokens": 2361161.0,
"reward": 0.5098337531089783,
"reward_std": 0.1368577778339386,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6724746227264404,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.030786586925387383,
"step": 10
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.37179744464512676,
"calib/avg_num_step_conf": 5.45703125,
"calib/ece": 0.28284,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.312,
"calib/gap": -0.023507258010569188,
"calib/mean_conf": 0.8796399999999999,
"calib/mu_c": 0.8703311258278145,
"calib/mu_w": 0.8938383838383837,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.27924,
"calib/std_conf": 0.05583072988955097,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7897932816537467,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": -0.005856798603074975,
"calib/step_q_w": 0.7956500802568217,
"calib/step_q_w_n": 623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 531.0703125,
"completions/mean_terminated_length": 537.3676147460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.006753360386937857,
"kl": 0.0003243088722229004,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.019,
"num_tokens": 2601595.0,
"reward": 0.49244898557662964,
"reward_std": 0.14083197712898254,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6497530937194824,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.023426111787557602,
"step": 11
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.46517780717453894,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.1634523809523809,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2976190476190476,
"calib/gap": -0.007576842269084061,
"calib/mean_conf": 0.8763888888888889,
"calib/mu_c": 0.8742541436464089,
"calib/mu_w": 0.8818309859154929,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16079365079365074,
"calib/std_conf": 0.05049021156466664,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7882559456398641,
"calib/step_q_c_n": 883.0,
"calib/step_q_gap": 0.0034418192458119945,
"calib/step_q_w": 0.7848141263940521,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 484.59765625,
"completions/mean_terminated_length": 486.4980773925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0128,
"grad_norm": 0.007357874885201454,
"kl": 0.0006407797336578369,
"learning_rate": 3e-06,
"loss": 0.0476,
"num_tokens": 2829828.0,
"reward": 0.5489996671676636,
"reward_std": 0.1275169402360916,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7473390698432922,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.013941464945673943,
"step": 12
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5545263559969442,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.2750390625,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.26953125,
"calib/gap": 0.005106951871657883,
"calib/mean_conf": 0.8766015625000001,
"calib/mu_c": 0.8786363636363637,
"calib/mu_w": 0.8735294117647058,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2750390625,
"calib/std_conf": 0.047841835746118624,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7919701086956523,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.014352847907659894,
"calib/step_q_w": 0.7776172607879924,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1362.0,
"completions/max_terminated_length": 1362.0,
"completions/mean_length": 456.11328125,
"completions/mean_terminated_length": 457.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.00735123734921217,
"kl": 0.0006067156791687012,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0061,
"num_tokens": 3051185.0,
"reward": 0.5164021253585815,
"reward_std": 0.1543343961238861,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6848277449607849,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.027664033696055412,
"step": 13
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4406148867313916,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.2902766798418972,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": -0.005965695792880088,
"calib/mean_conf": 0.8831620553359684,
"calib/mu_c": 0.8807333333333335,
"calib/mu_w": 0.8866990291262136,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2902766798418972,
"calib/std_conf": 0.044155239785541624,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7921544209215442,
"calib/step_q_c_n": 803.0,
"calib/step_q_gap": 0.01664199856129578,
"calib/step_q_w": 0.7755124223602484,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 544.5859375,
"completions/mean_terminated_length": 546.7216186523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.007078804075717926,
"kl": 0.0010630488395690918,
"learning_rate": 3.5e-06,
"loss": -0.0326,
"num_tokens": 3295999.0,
"reward": 0.49770310521125793,
"reward_std": 0.1487538069486618,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.661691427230835,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.018871046602725983,
"step": 14
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.46976936799184504,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.31177865612648215,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.33992094861660077,
"calib/gap": -0.003974898063200838,
"calib/mean_conf": 0.8809486166007905,
"calib/mu_c": 0.8792361111111111,
"calib/mu_w": 0.883211009174312,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31177865612648215,
"calib/std_conf": 0.04669183862870357,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7845135135135135,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": -0.008341127467222043,
"calib/step_q_w": 0.7928546409807355,
"calib/step_q_w_n": 571.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 471.171875,
"completions/mean_terminated_length": 473.0196228027344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.016,
"grad_norm": 0.007281082682311535,
"kl": 0.0008074045181274414,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0353,
"num_tokens": 3524499.0,
"reward": 0.4907722473144531,
"reward_std": 0.1448393166065216,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6457914113998413,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.025596803054213524,
"step": 15
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47812584391034296,
"calib/avg_num_step_conf": 6.42578125,
"calib/ece": 0.24822134387351769,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3241106719367589,
"calib/gap": -0.003385093167701858,
"calib/mean_conf": 0.884584980237154,
"calib/mu_c": 0.8833540372670807,
"calib/mu_w": 0.8867391304347826,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24822134387351769,
"calib/std_conf": 0.046857003177099794,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7781346678798908,
"calib/step_q_c_n": 1099.0,
"calib/step_q_gap": 0.02170609645131938,
"calib/step_q_w": 0.7564285714285715,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2532.0,
"completions/max_terminated_length": 2532.0,
"completions/mean_length": 634.10546875,
"completions/mean_terminated_length": 634.10546875,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.006321294233202934,
"kl": 0.0008182525634765625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0243,
"num_tokens": 3795678.0,
"reward": 0.5235186219215393,
"reward_std": 0.1506902426481247,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6949781179428101,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.02862163446843624,
"step": 16
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5686486486486487,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.14972549019607842,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.25098039215686274,
"calib/gap": 0.010729729729729853,
"calib/mean_conf": 0.8727843137254903,
"calib/mu_c": 0.8757297297297297,
"calib/mu_w": 0.8649999999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14850980392156862,
"calib/std_conf": 0.04758163829630502,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7728096118299445,
"calib/step_q_c_n": 1082.0,
"calib/step_q_gap": 0.018269221857799667,
"calib/step_q_w": 0.7545403899721448,
"calib/step_q_w_n": 359.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1184.0,
"completions/max_terminated_length": 1184.0,
"completions/mean_length": 497.2734375,
"completions/mean_terminated_length": 499.22357177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.007205154746770859,
"kl": 0.001395106315612793,
"learning_rate": 4.25e-06,
"loss": -0.0207,
"num_tokens": 4026508.0,
"reward": 0.5758857727050781,
"reward_std": 0.14907167851924896,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7742902636528015,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.03451257944107056,
"step": 17
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4860372340425532,
"calib/avg_num_step_conf": 5.578125,
"calib/ece": 0.32288537549407115,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.31225296442687744,
"calib/gap": -0.0030097517730496293,
"calib/mean_conf": 0.880197628458498,
"calib/mu_c": 0.8788652482269502,
"calib/mu_w": 0.8818749999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32288537549407115,
"calib/std_conf": 0.04862903533260498,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7488387978142076,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.019413510457885863,
"calib/step_q_w": 0.7294252873563217,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2594.0,
"completions/max_terminated_length": 2594.0,
"completions/mean_length": 522.1171875,
"completions/mean_terminated_length": 522.1171875,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0192,
"grad_norm": 0.007514473982155323,
"kl": 0.0019025802612304688,
"learning_rate": 4.5e-06,
"loss": 0.0443,
"num_tokens": 4270890.0,
"reward": 0.4936829209327698,
"reward_std": 0.134256511926651,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.637388288974762,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.042946361005306244,
"step": 18
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5046207162110128,
"calib/avg_num_step_conf": 4.73828125,
"calib/ece": 0.2918577075098815,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2727272727272727,
"calib/gap": 0.009512257733282015,
"calib/mean_conf": 0.8728853754940713,
"calib/mu_c": 0.8768707482993198,
"calib/mu_w": 0.8673584905660378,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2918577075098815,
"calib/std_conf": 0.06021022708978,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7552691218130312,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.027576814120723547,
"calib/step_q_w": 0.7276923076923076,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 468.10546875,
"completions/mean_terminated_length": 468.10546875,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.007622662466019392,
"kl": 0.0032324790954589844,
"learning_rate": 4.75e-06,
"loss": 0.031,
"num_tokens": 4495485.0,
"reward": 0.5108803510665894,
"reward_std": 0.14525091648101807,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6640562415122986,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.04598575830459595,
"step": 19
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4476190476190476,
"calib/avg_num_step_conf": 5.73828125,
"calib/ece": 0.28889763779527555,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2559055118110236,
"calib/gap": -0.007488654522211702,
"calib/mean_conf": 0.8755118110236221,
"calib/mu_c": 0.8724161073825502,
"calib/mu_w": 0.8799047619047619,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.28889763779527555,
"calib/std_conf": 0.04754125703367803,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7442355889724311,
"calib/step_q_c_n": 798.0,
"calib/step_q_gap": 0.018795946647542916,
"calib/step_q_w": 0.7254396423248882,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2643.0,
"completions/max_terminated_length": 2643.0,
"completions/mean_length": 471.95703125,
"completions/mean_terminated_length": 471.95703125,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.013368158601224422,
"kl": 0.0924372673034668,
"learning_rate": 5e-06,
"loss": 0.0014,
"num_tokens": 4721178.0,
"reward": 0.5101369023323059,
"reward_std": 0.12661463022232056,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6629281044006348,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.042501889169216156,
"step": 20
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.45863618342804285,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.2530980392156864,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2784313725490196,
"calib/gap": -0.011480111008325578,
"calib/mean_conf": 0.8746666666666667,
"calib/mu_c": 0.8704347826086956,
"calib/mu_w": 0.8819148936170211,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24819607843137265,
"calib/std_conf": 0.05550292932268253,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7002188940092166,
"calib/step_q_c_n": 868.0,
"calib/step_q_gap": -0.024235331342896127,
"calib/step_q_w": 0.7244542253521127,
"calib/step_q_w_n": 568.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1558.0,
"completions/max_terminated_length": 1558.0,
"completions/mean_length": 475.74609375,
"completions/mean_terminated_length": 477.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.0224,
"grad_norm": 0.008174967020750046,
"kl": 0.006114959716796875,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0049,
"num_tokens": 4945929.0,
"reward": 0.5330367684364319,
"reward_std": 0.17249098420143127,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6969093680381775,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.044164177030324936,
"step": 21
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5389050901378578,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.24429687500000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.27734375,
"calib/gap": 0.0049257688229055185,
"calib/mean_conf": 0.8755468750000001,
"calib/mu_c": 0.8773170731707315,
"calib/mu_w": 0.872391304347826,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.239609375,
"calib/std_conf": 0.046825950900482256,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7113409563409563,
"calib/step_q_c_n": 962.0,
"calib/step_q_gap": 0.029228806808246133,
"calib/step_q_w": 0.6821121495327102,
"calib/step_q_w_n": 535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1163.0,
"completions/max_terminated_length": 1163.0,
"completions/mean_length": 452.40625,
"completions/mean_terminated_length": 454.180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.008732265792787075,
"kl": 0.01003265380859375,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0148,
"num_tokens": 5163561.0,
"reward": 0.5561990141868591,
"reward_std": 0.1382788121700287,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.714662492275238,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.06961052119731903,
"step": 22
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5421323894684551,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.34881889763779533,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.23228346456692914,
"calib/gap": 0.008319672131147593,
"calib/mean_conf": 0.868503937007874,
"calib/mu_c": 0.8724999999999999,
"calib/mu_w": 0.8641803278688523,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34881889763779533,
"calib/std_conf": 0.053212968938409426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6867546174142481,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.005773376433007238,
"calib/step_q_w": 0.6809812409812409,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1985.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 502.671875,
"completions/mean_terminated_length": 502.671875,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.0077652414329349995,
"kl": 0.012403488159179688,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0086,
"num_tokens": 5396181.0,
"reward": 0.4989694654941559,
"reward_std": 0.19842839241027832,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6251125335693359,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.07126393914222717,
"step": 23
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5088501334027461,
"calib/avg_num_step_conf": 6.88671875,
"calib/ece": 0.357741935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.3024193548387097,
"calib/gap": 0.008238433005791745,
"calib/mean_conf": 0.8698387096774195,
"calib/mu_c": 0.8738582677165355,
"calib/mu_w": 0.8656198347107438,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.357741935483871,
"calib/std_conf": 0.07052206096113496,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6419435028248588,
"calib/step_q_c_n": 885.0,
"calib/step_q_gap": 0.01714851421438046,
"calib/step_q_w": 0.6247949886104783,
"calib/step_q_w_n": 878.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2530.0,
"completions/max_terminated_length": 2530.0,
"completions/mean_length": 575.76171875,
"completions/mean_terminated_length": 578.0196533203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0256,
"grad_norm": 0.007695337291806936,
"kl": 0.018037796020507812,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0504,
"num_tokens": 5648088.0,
"reward": 0.48372983932495117,
"reward_std": 0.21942198276519775,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6018944978713989,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.07259640097618103,
"step": 24
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5281599588530281,
"calib/avg_num_step_conf": 6.44921875,
"calib/ece": 0.2767843137254902,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.32941176470588235,
"calib/gap": 0.008054519737688337,
"calib/mean_conf": 0.8807058823529412,
"calib/mu_c": 0.883896103896104,
"calib/mu_w": 0.8758415841584156,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2767843137254902,
"calib/std_conf": 0.054853564103070726,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6362807933194154,
"calib/step_q_c_n": 958.0,
"calib/step_q_gap": 0.0004799275185496388,
"calib/step_q_w": 0.6358008658008658,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 513.18359375,
"completions/mean_terminated_length": 513.18359375,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.007971785962581635,
"kl": 0.022886276245117188,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0247,
"num_tokens": 5882687.0,
"reward": 0.549470067024231,
"reward_std": 0.1518770456314087,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6811434030532837,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.0990467220544815,
"step": 25
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47485461274121066,
"calib/avg_num_step_conf": 6.02734375,
"calib/ece": 0.26877470355731214,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": -0.001648162833730038,
"calib/mean_conf": 0.8853754940711462,
"calib/mu_c": 0.8847435897435896,
"calib/mu_w": 0.8863917525773196,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26877470355731214,
"calib/std_conf": 0.04970622502840137,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6674917127071823,
"calib/step_q_c_n": 905.0,
"calib/step_q_gap": -0.0069596979511249435,
"calib/step_q_w": 0.6744514106583073,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2619.0,
"completions/max_terminated_length": 2619.0,
"completions/mean_length": 510.875,
"completions/mean_terminated_length": 512.8784790039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.008650614880025387,
"kl": 0.029087066650390625,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0388,
"num_tokens": 6118711.0,
"reward": 0.5607102513313293,
"reward_std": 0.14517641067504883,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6800421476364136,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.12184715270996094,
"step": 26
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5288062015503876,
"calib/avg_num_step_conf": 7.20703125,
"calib/ece": 0.3875590551181102,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.29133858267716534,
"calib/gap": 0.008540775193798589,
"calib/mean_conf": 0.8711023622047245,
"calib/mu_c": 0.8754400000000001,
"calib/mu_w": 0.8668992248062015,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.383267716535433,
"calib/std_conf": 0.08867554187431534,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6481046511627906,
"calib/step_q_c_n": 860.0,
"calib/step_q_gap": 0.037434600401369256,
"calib/step_q_w": 0.6106700507614213,
"calib/step_q_w_n": 985.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 538.12109375,
"completions/mean_terminated_length": 540.2313842773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0288,
"grad_norm": 0.007840660400688648,
"kl": 0.038330078125,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0064,
"num_tokens": 6361686.0,
"reward": 0.5033316016197205,
"reward_std": 0.20111700892448425,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5981351137161255,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.11243432015180588,
"step": 27
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5401933900995816,
"calib/avg_num_step_conf": 6.5625,
"calib/ece": 0.21661354581673303,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.350597609561753,
"calib/gap": 0.013213306393418889,
"calib/mean_conf": 0.8782868525896413,
"calib/mu_c": 0.882603550295858,
"calib/mu_w": 0.8693902439024391,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21079681274900394,
"calib/std_conf": 0.08362226172396235,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6238046511627907,
"calib/step_q_c_n": 1075.0,
"calib/step_q_gap": 0.028779857774360984,
"calib/step_q_w": 0.5950247933884297,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2903.0,
"completions/max_terminated_length": 2903.0,
"completions/mean_length": 580.86328125,
"completions/mean_terminated_length": 583.1412353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.007725785952061415,
"kl": 0.03627777099609375,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0077,
"num_tokens": 6617331.0,
"reward": 0.5768232941627502,
"reward_std": 0.19204816222190857,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7224472761154175,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.10229302942752838,
"step": 28
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5165031222123104,
"calib/avg_num_step_conf": 7.30859375,
"calib/ece": 0.36490039840637445,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3904382470119522,
"calib/gap": 0.0016420288008153205,
"calib/mean_conf": 0.8923107569721116,
"calib/mu_c": 0.8930827067669171,
"calib/mu_w": 0.8914406779661018,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3636653386454183,
"calib/std_conf": 0.05237239323852384,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6024052004333694,
"calib/step_q_c_n": 923.0,
"calib/step_q_gap": 0.03731026372450874,
"calib/step_q_w": 0.5650949367088607,
"calib/step_q_w_n": 948.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2982.0,
"completions/max_terminated_length": 2982.0,
"completions/mean_length": 651.48828125,
"completions/mean_terminated_length": 651.48828125,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.006872882600873709,
"kl": 0.035243988037109375,
"learning_rate": 4.75e-06,
"loss": 0.0606,
"num_tokens": 6891240.0,
"reward": 0.5300413966178894,
"reward_std": 0.1835039108991623,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6055496335029602,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.15453320741653442,
"step": 29
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4484031132581857,
"calib/avg_num_step_conf": 6.85546875,
"calib/ece": 0.32394308943089434,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3170731707317073,
"calib/gap": -0.00435587761674705,
"calib/mean_conf": 0.8779268292682927,
"calib/mu_c": 0.8760144927536232,
"calib/mu_w": 0.8803703703703702,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3204471544715448,
"calib/std_conf": 0.085953350056187,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5931235697940505,
"calib/step_q_c_n": 874.0,
"calib/step_q_gap": 0.03481482972594607,
"calib/step_q_w": 0.5583087400681044,
"calib/step_q_w_n": 881.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2445.0,
"completions/max_terminated_length": 2445.0,
"completions/mean_length": 614.6875,
"completions/mean_terminated_length": 624.4444580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.032,
"grad_norm": 0.006772920954972506,
"kl": 0.038608551025390625,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0047,
"num_tokens": 7155584.0,
"reward": 0.542879581451416,
"reward_std": 0.2689790725708008,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6185808181762695,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.16717834770679474,
"step": 30
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5401891252955082,
"calib/avg_num_step_conf": 7.625,
"calib/ece": 0.4343650793650794,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.38095238095238093,
"calib/gap": 0.031278512555108495,
"calib/mean_conf": 0.8748412698412698,
"calib/mu_c": 0.8923423423423424,
"calib/mu_w": 0.861063829787234,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4343650793650794,
"calib/std_conf": 0.11757965302184178,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5797603195739014,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.029137505252502582,
"calib/step_q_w": 0.5506228143213988,
"calib/step_q_w_n": 1201.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2770.0,
"completions/max_terminated_length": 2770.0,
"completions/mean_length": 630.57421875,
"completions/mean_terminated_length": 633.047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.006630671210587025,
"kl": 0.04232025146484375,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0466,
"num_tokens": 7422923.0,
"reward": 0.5082546472549438,
"reward_std": 0.21992552280426025,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.5576117038726807,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.17530381679534912,
"step": 31
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5533968090581575,
"calib/avg_num_step_conf": 7.0859375,
"calib/ece": 0.3418400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.356,
"calib/gap": 0.021060216160576406,
"calib/mean_conf": 0.8778400000000001,
"calib/mu_c": 0.8876119402985074,
"calib/mu_w": 0.866551724137931,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3418400000000001,
"calib/std_conf": 0.08215189833473113,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5951980324074073,
"calib/step_q_c_n": 864.0,
"calib/step_q_gap": 0.03201908503898632,
"calib/step_q_w": 0.563178947368421,
"calib/step_q_w_n": 950.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 590.328125,
"completions/mean_terminated_length": 597.3280639648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.007396162953227758,
"kl": 0.0458984375,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0152,
"num_tokens": 7680751.0,
"reward": 0.5492283701896667,
"reward_std": 0.2305486500263214,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6220609545707703,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.1771770715713501,
"step": 32
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5556763810352328,
"calib/avg_num_step_conf": 6.765625,
"calib/ece": 0.40216535433070866,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.33858267716535434,
"calib/gap": 0.013349282296650666,
"calib/mean_conf": 0.878464566929134,
"calib/mu_c": 0.8854545454545455,
"calib/mu_w": 0.8721052631578948,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4021259842519685,
"calib/std_conf": 0.06160904106823838,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5805651105651106,
"calib/step_q_c_n": 814.0,
"calib/step_q_gap": 0.0001729537023654748,
"calib/step_q_w": 0.5803921568627451,
"calib/step_q_w_n": 918.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2244.0,
"completions/max_terminated_length": 2244.0,
"completions/mean_length": 577.65234375,
"completions/mean_terminated_length": 579.9176635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.0352,
"grad_norm": 0.006979378871619701,
"kl": 0.048908233642578125,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0604,
"num_tokens": 7935502.0,
"reward": 0.5247606039047241,
"reward_std": 0.21103210747241974,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.5871254205703735,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.16942700743675232,
"step": 33
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4795370890261401,
"calib/avg_num_step_conf": 6.77734375,
"calib/ece": 0.3318503937007874,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.29133858267716534,
"calib/gap": -0.0013818703599725035,
"calib/mean_conf": 0.8712204724409448,
"calib/mu_c": 0.8705839416058394,
"calib/mu_w": 0.8719658119658119,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3318503937007874,
"calib/std_conf": 0.06628860941639579,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5580782918149466,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.03725990616472241,
"calib/step_q_w": 0.5208183856502242,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3059.0,
"completions/max_terminated_length": 3059.0,
"completions/mean_length": 522.265625,
"completions/mean_terminated_length": 524.3137817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.010433186776936054,
"kl": 0.05446624755859375,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0187,
"num_tokens": 8174314.0,
"reward": 0.5656172037124634,
"reward_std": 0.20620962977409363,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6313730478286743,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.194392591714859,
"step": 34
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5385248496359607,
"calib/avg_num_step_conf": 6.125,
"calib/ece": 0.30519841269841275,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.20238095238095238,
"calib/gap": 0.024358974358974494,
"calib/mean_conf": 0.8409126984126986,
"calib/mu_c": 0.8522222222222223,
"calib/mu_w": 0.8278632478632478,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30519841269841275,
"calib/std_conf": 0.10666834223693417,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.538297619047619,
"calib/step_q_c_n": 840.0,
"calib/step_q_gap": 0.006965201465201409,
"calib/step_q_w": 0.5313324175824176,
"calib/step_q_w_n": 728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2786.0,
"completions/max_terminated_length": 2786.0,
"completions/mean_length": 603.01171875,
"completions/mean_terminated_length": 607.7598266601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 243.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.007517626043409109,
"kl": 0.044712066650390625,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0204,
"num_tokens": 8437941.0,
"reward": 0.5808595418930054,
"reward_std": 0.2326761782169342,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.646169126033783,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.21398743987083435,
"step": 35
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5064760508308895,
"calib/avg_num_step_conf": 6.26953125,
"calib/ece": 0.12333333333333321,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.16666666666666666,
"calib/gap": -0.007086999022482887,
"calib/mean_conf": 0.8449206349206348,
"calib/mu_c": 0.8430645161290322,
"calib/mu_w": 0.8501515151515151,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11507936507936498,
"calib/std_conf": 0.08589164484142289,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5507913043478261,
"calib/step_q_c_n": 1150.0,
"calib/step_q_gap": 0.026351743908265668,
"calib/step_q_w": 0.5244395604395604,
"calib/step_q_w_n": 455.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2036.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 530.5859375,
"completions/mean_terminated_length": 532.6666870117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.0384,
"grad_norm": 0.007462262641638517,
"kl": 0.0557708740234375,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0302,
"num_tokens": 8676483.0,
"reward": 0.6908488273620605,
"reward_std": 0.18387025594711304,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.7709254026412964,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2701471447944641,
"step": 36
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5613636363636365,
"calib/avg_num_step_conf": 6.5390625,
"calib/ece": 0.3771370967741935,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0846774193548387,
"calib/gap": 0.009750988142292472,
"calib/mean_conf": 0.8058467741935483,
"calib/mu_c": 0.8112727272727271,
"calib/mu_w": 0.8015217391304347,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36971774193548385,
"calib/std_conf": 0.10765128472293026,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5435054773082941,
"calib/step_q_c_n": 639.0,
"calib/step_q_gap": 0.10539919711505746,
"calib/step_q_w": 0.43810628019323666,
"calib/step_q_w_n": 1035.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2192.0,
"completions/max_terminated_length": 2192.0,
"completions/mean_length": 550.25390625,
"completions/mean_terminated_length": 556.7786865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.007464609574526548,
"kl": 0.054592132568359375,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0323,
"num_tokens": 8924444.0,
"reward": 0.5121106505393982,
"reward_std": 0.1716907024383545,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.5959277153015137,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.14860600233078003,
"step": 37
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5289828089120587,
"calib/avg_num_step_conf": 6.1640625,
"calib/ece": 0.26905511811023625,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.05905511811023622,
"calib/gap": 0.013293613852169095,
"calib/mean_conf": 0.7641732283464567,
"calib/mu_c": 0.7706106870229008,
"calib/mu_w": 0.7573170731707317,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.258740157480315,
"calib/std_conf": 0.1267115326701758,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49724181360201514,
"calib/step_q_c_n": 794.0,
"calib/step_q_gap": 0.03290507890813765,
"calib/step_q_w": 0.4643367346938775,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1547.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 503.859375,
"completions/mean_terminated_length": 507.8267822265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.0074347201734781265,
"kl": 0.05693817138671875,
"learning_rate": 4.5e-06,
"loss": -0.0008,
"num_tokens": 9160320.0,
"reward": 0.5768137574195862,
"reward_std": 0.18113070726394653,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.673811674118042,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.1790345162153244,
"step": 38
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4705807680101555,
"calib/avg_num_step_conf": 6.109375,
"calib/ece": 0.26968253968253975,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.04365079365079365,
"calib/gap": -0.014816883529038316,
"calib/mean_conf": 0.746031746031746,
"calib/mu_c": 0.7392700729927008,
"calib/mu_w": 0.7540869565217391,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23603174603174615,
"calib/std_conf": 0.15515777615100693,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48040342298288513,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.024183181696021905,
"calib/step_q_w": 0.4562202412868632,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2499.0,
"completions/max_terminated_length": 2499.0,
"completions/mean_length": 525.46875,
"completions/mean_terminated_length": 531.6996459960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0416,
"grad_norm": 0.006516232155263424,
"kl": 0.05139923095703125,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0296,
"num_tokens": 9400928.0,
"reward": 0.5995521545410156,
"reward_std": 0.1989758312702179,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6689039468765259,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2262941598892212,
"step": 39
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5063053641732284,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.21466666666666673,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.011764705882352941,
"calib/gap": 0.016117125984251968,
"calib/mean_conf": 0.6949019607843137,
"calib/mu_c": 0.702992125984252,
"calib/mu_w": 0.686875,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20576470588235302,
"calib/std_conf": 0.15402027558173298,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47658192090395485,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": 0.03949752192697281,
"calib/step_q_w": 0.43708439897698204,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2304.0,
"completions/max_terminated_length": 2304.0,
"completions/mean_length": 556.90234375,
"completions/mean_terminated_length": 556.90234375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.08801660686731339,
"kl": 0.13840866088867188,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0039,
"num_tokens": 9650255.0,
"reward": 0.5951794385910034,
"reward_std": 0.1751280426979065,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6928679347038269,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.19905337691307068,
"step": 40
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4886706258565554,
"calib/avg_num_step_conf": 5.82421875,
"calib/ece": 0.1355905511811023,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.01968503937007874,
"calib/gap": -0.0021306532663315503,
"calib/mean_conf": 0.6843307086614173,
"calib/mu_c": 0.6838693467336684,
"calib/mu_w": 0.6859999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.018228346456692867,
"calib/std_conf": 0.15763053858555895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43566401746724887,
"calib/step_q_c_n": 1145.0,
"calib/step_q_gap": 0.013929913421006024,
"calib/step_q_w": 0.42173410404624284,
"calib/step_q_w_n": 346.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 482.26171875,
"completions/mean_terminated_length": 482.26171875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.006632791832089424,
"kl": 0.06319427490234375,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0212,
"num_tokens": 9880962.0,
"reward": 0.7019416093826294,
"reward_std": 0.1809065341949463,
"rewards/accuracy_reward_step": 0.77734375,
"rewards/final_brier_reward_step": 0.7887437343597412,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.26123327016830444,
"step": 41
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6127613462519124,
"calib/avg_num_step_conf": 5.73046875,
"calib/ece": 0.08677165354330707,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": 0.06857470678225386,
"calib/mean_conf": 0.6425984251968504,
"calib/mu_c": 0.6712162162162162,
"calib/mu_w": 0.6026415094339623,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07334645669291337,
"calib/std_conf": 0.16609556398493233,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44271764705882355,
"calib/step_q_c_n": 850.0,
"calib/step_q_gap": 0.052199008485079634,
"calib/step_q_w": 0.3905186385737439,
"calib/step_q_w_n": 617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2762.0,
"completions/max_terminated_length": 2762.0,
"completions/mean_length": 450.9921875,
"completions/mean_terminated_length": 450.9921875,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0448,
"grad_norm": 0.007387762889266014,
"kl": 0.06370925903320312,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0156,
"num_tokens": 10100784.0,
"reward": 0.642842173576355,
"reward_std": 0.16730615496635437,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7530773878097534,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.21854455769062042,
"step": 42
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5963322884012539,
"calib/avg_num_step_conf": 5.73046875,
"calib/ece": 0.08086274509803923,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.07805015673981208,
"calib/mean_conf": 0.6107450980392156,
"calib/mu_c": 0.6444137931034484,
"calib/mu_w": 0.5663636363636363,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0614901960784314,
"calib/std_conf": 0.1649379970416994,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4295169082125604,
"calib/step_q_c_n": 828.0,
"calib/step_q_gap": 0.05078451384636323,
"calib/step_q_w": 0.37873239436619716,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1838.0,
"completions/max_terminated_length": 1838.0,
"completions/mean_length": 497.91015625,
"completions/mean_terminated_length": 499.8627624511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.006303076166659594,
"kl": 0.05922698974609375,
"learning_rate": 4.361111111111112e-06,
"loss": 0.023,
"num_tokens": 10333473.0,
"reward": 0.6634681820869446,
"reward_std": 0.1730535626411438,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.76103675365448,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.2526184022426605,
"step": 43
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5205368793885541,
"calib/avg_num_step_conf": 6.27734375,
"calib/ece": 0.17744094488188975,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": 0.020358540980550566,
"calib/mean_conf": 0.6124803149606299,
"calib/mu_c": 0.6231404958677687,
"calib/mu_w": 0.6027819548872181,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15677165354330705,
"calib/std_conf": 0.18767472071006683,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41140000000000004,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.037411428571428595,
"calib/step_q_w": 0.37398857142857145,
"calib/step_q_w_n": 875.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1799.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 524.71484375,
"completions/mean_terminated_length": 526.7725830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.006215093191713095,
"kl": 0.05739593505859375,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0134,
"num_tokens": 10574120.0,
"reward": 0.5865083336830139,
"reward_std": 0.17506547272205353,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7014456987380981,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.1786021888256073,
"step": 44
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6223754110801922,
"calib/avg_num_step_conf": 6.04296875,
"calib/ece": 0.09706349206349207,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.015873015873015872,
"calib/gap": 0.08617252719453583,
"calib/mean_conf": 0.5691269841269841,
"calib/mu_c": 0.6094776119402986,
"calib/mu_w": 0.5233050847457628,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06722222222222222,
"calib/std_conf": 0.19493393199565195,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42671205846528626,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.05219236149558931,
"calib/step_q_w": 0.37451969696969695,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2819.0,
"completions/max_terminated_length": 2819.0,
"completions/mean_length": 490.828125,
"completions/mean_terminated_length": 490.828125,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.048,
"grad_norm": 0.005764865782111883,
"kl": 0.06572723388671875,
"learning_rate": 4.305555555555556e-06,
"loss": 0.0445,
"num_tokens": 10804820.0,
"reward": 0.6049693822860718,
"reward_std": 0.17337967455387115,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7427343726158142,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.1656419336795807,
"step": 45
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4638007572790182,
"calib/avg_num_step_conf": 6.61328125,
"calib/ece": 0.1714457831325301,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.020080321285140562,
"calib/gap": -0.017647865256560813,
"calib/mean_conf": 0.5830120481927712,
"calib/mu_c": 0.5751449275362319,
"calib/mu_w": 0.5927927927927927,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10012048192771085,
"calib/std_conf": 0.18503707393477412,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37555155875299767,
"calib/step_q_c_n": 834.0,
"calib/step_q_gap": 0.030021873071973293,
"calib/step_q_w": 0.3455296856810244,
"calib/step_q_w_n": 859.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2927.0,
"completions/max_terminated_length": 2927.0,
"completions/mean_length": 530.265625,
"completions/mean_terminated_length": 530.265625,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.005777245853096247,
"kl": 0.0604095458984375,
"learning_rate": 4.277777777777778e-06,
"loss": 0.0352,
"num_tokens": 11045336.0,
"reward": 0.5770740509033203,
"reward_std": 0.1873425543308258,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6897605657577515,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.16204379498958588,
"step": 46
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5565924142083082,
"calib/avg_num_step_conf": 6.47265625,
"calib/ece": 0.13240000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.02,
"calib/gap": 0.060226102080406574,
"calib/mean_conf": 0.59072,
"calib/mu_c": 0.6145695364238409,
"calib/mu_w": 0.5543434343434344,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.059560000000000016,
"calib/std_conf": 0.19188194704036127,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39741200828157347,
"calib/step_q_c_n": 966.0,
"calib/step_q_gap": 0.0360314004668123,
"calib/step_q_w": 0.36138060781476117,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 524.9609375,
"completions/mean_terminated_length": 529.094482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.0055895112454891205,
"kl": 0.06354522705078125,
"learning_rate": 4.25e-06,
"loss": 0.0356,
"num_tokens": 11285702.0,
"reward": 0.6485556364059448,
"reward_std": 0.1632830947637558,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7349914312362671,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.24883851408958435,
"step": 47
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5178482587064677,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.13555118110236217,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.023622047244094488,
"calib/gap": 0.031546019900497435,
"calib/mean_conf": 0.5625590551181102,
"calib/mu_c": 0.5774626865671642,
"calib/mu_w": 0.5459166666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08527559055118106,
"calib/std_conf": 0.18684741147233042,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42755827338129493,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.05981382893685039,
"calib/step_q_w": 0.36774444444444454,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2358.0,
"completions/max_terminated_length": 2358.0,
"completions/mean_length": 471.73828125,
"completions/mean_terminated_length": 471.73828125,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0512,
"grad_norm": 0.005875094328075647,
"kl": 0.07415771484375,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0597,
"num_tokens": 11510155.0,
"reward": 0.59670490026474,
"reward_std": 0.19118118286132812,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7246417999267578,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.16564300656318665,
"step": 48
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.509004270066217,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.14707031250000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.015625,
"calib/gap": 0.009915836376013454,
"calib/mean_conf": 0.5952734375000001,
"calib/mu_c": 0.5996503496503497,
"calib/mu_w": 0.5897345132743362,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09187500000000001,
"calib/std_conf": 0.1655339473701198,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3988808664259928,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.007707934095165736,
"calib/step_q_w": 0.39117293233082706,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1834.0,
"completions/max_terminated_length": 1834.0,
"completions/mean_length": 464.66015625,
"completions/mean_terminated_length": 466.4823913574219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.005934995133429766,
"kl": 0.072479248046875,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0048,
"num_tokens": 11733644.0,
"reward": 0.6388311982154846,
"reward_std": 0.15334394574165344,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7295762300491333,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.23636746406555176,
"step": 49
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5598329635912828,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.09909803921568625,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.03743051024402966,
"calib/mean_conf": 0.6024705882352941,
"calib/mu_c": 0.6167088607594936,
"calib/mu_w": 0.579278350515464,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.04098039215686272,
"calib/std_conf": 0.1625114599809631,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43825740318906603,
"calib/step_q_c_n": 878.0,
"calib/step_q_gap": 0.026513138245608803,
"calib/step_q_w": 0.4117442649434572,
"calib/step_q_w_n": 619.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1448.0,
"completions/max_terminated_length": 1448.0,
"completions/mean_length": 478.62890625,
"completions/mean_terminated_length": 480.50592041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.006061997264623642,
"kl": 0.0747833251953125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0099,
"num_tokens": 11961533.0,
"reward": 0.6497402191162109,
"reward_std": 0.186766117811203,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7475171685218811,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.23086941242218018,
"step": 50
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4809847748623259,
"calib/avg_num_step_conf": 5.921875,
"calib/ece": 0.12714285714285706,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.01984126984126984,
"calib/gap": 0.016517006802721057,
"calib/mean_conf": 0.6237301587301588,
"calib/mu_c": 0.6306122448979592,
"calib/mu_w": 0.6140952380952381,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08376984126984119,
"calib/std_conf": 0.168413390331149,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42119133574007217,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.03764296591038846,
"calib/step_q_w": 0.3835483698296837,
"calib/step_q_w_n": 685.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2825.0,
"completions/max_terminated_length": 2825.0,
"completions/mean_length": 529.2421875,
"completions/mean_terminated_length": 529.2421875,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.0544,
"grad_norm": 0.00549277663230896,
"kl": 0.06853485107421875,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0231,
"num_tokens": 12206315.0,
"reward": 0.6381635665893555,
"reward_std": 0.18898846209049225,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7234945297241211,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.24033261835575104,
"step": 51
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5987903225806452,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.15220472440944885,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.023622047244094488,
"calib/gap": 0.08058823529411774,
"calib/mean_conf": 0.5884251968503939,
"calib/mu_c": 0.6100000000000001,
"calib/mu_w": 0.5294117647058824,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.004173228346456682,
"calib/std_conf": 0.18439056971484308,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41865711556829033,
"calib/step_q_c_n": 1047.0,
"calib/step_q_gap": -0.0016654650768709311,
"calib/step_q_w": 0.42032258064516126,
"calib/step_q_w_n": 372.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1337.0,
"completions/max_terminated_length": 1337.0,
"completions/mean_length": 458.71484375,
"completions/mean_terminated_length": 460.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.006017228588461876,
"kl": 0.0811004638671875,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0145,
"num_tokens": 12431698.0,
"reward": 0.721343994140625,
"reward_std": 0.17622464895248413,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7716425657272339,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.32729530334472656,
"step": 52
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.603898966994205,
"calib/avg_num_step_conf": 6.33203125,
"calib/ece": 0.08941176470588239,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.06378117913832193,
"calib/mean_conf": 0.5948235294117648,
"calib/mu_c": 0.6218367346938776,
"calib/mu_w": 0.5580555555555556,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05388235294117648,
"calib/std_conf": 0.168236144245761,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4338461538461539,
"calib/step_q_c_n": 962.0,
"calib/step_q_gap": 0.051437200887125056,
"calib/step_q_w": 0.3824089529590288,
"calib/step_q_w_n": 659.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2627.0,
"completions/max_terminated_length": 2627.0,
"completions/mean_length": 517.9453125,
"completions/mean_terminated_length": 517.9453125,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.0055801658891141415,
"kl": 0.07416534423828125,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0473,
"num_tokens": 12670116.0,
"reward": 0.6302124857902527,
"reward_std": 0.16622228920459747,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7553898096084595,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.19097262620925903,
"step": 53
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.591922388881244,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.10635294117647061,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.043137254901960784,
"calib/gap": 0.061826063024632005,
"calib/mean_conf": 0.6305098039215686,
"calib/mu_c": 0.6513609467455621,
"calib/mu_w": 0.5895348837209301,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.037058823529411755,
"calib/std_conf": 0.16956499968911448,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4601995515695067,
"calib/step_q_c_n": 892.0,
"calib/step_q_gap": 0.05714493763353684,
"calib/step_q_w": 0.40305461393596986,
"calib/step_q_w_n": 531.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2176.0,
"completions/max_terminated_length": 2176.0,
"completions/mean_length": 460.70703125,
"completions/mean_terminated_length": 460.70703125,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.0576,
"grad_norm": 0.006231546867638826,
"kl": 0.08040618896484375,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0117,
"num_tokens": 12894289.0,
"reward": 0.6591579914093018,
"reward_std": 0.15599699318408966,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7713078260421753,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.21575812995433807,
"step": 54
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5949283559577677,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.12999999999999998,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.039525691699604744,
"calib/gap": 0.0689335093011566,
"calib/mean_conf": 0.6536363636363637,
"calib/mu_c": 0.685514705882353,
"calib/mu_w": 0.6165811965811964,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12304347826086956,
"calib/std_conf": 0.17501501398558267,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45723473541383985,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.006887030267803684,
"calib/step_q_w": 0.45034770514603617,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 529.046875,
"completions/mean_terminated_length": 529.046875,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.00565149262547493,
"kl": 0.08214569091796875,
"learning_rate": 4.027777777777779e-06,
"loss": 0.0722,
"num_tokens": 13137549.0,
"reward": 0.6621849536895752,
"reward_std": 0.2255595624446869,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7321434020996094,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.28910163044929504,
"step": 55
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5978233830845772,
"calib/avg_num_step_conf": 6.20703125,
"calib/ece": 0.2053543307086614,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.031496062992125984,
"calib/gap": 0.050671641791044775,
"calib/mean_conf": 0.6682677165354332,
"calib/mu_c": 0.6950000000000001,
"calib/mu_w": 0.6443283582089553,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20059055118110233,
"calib/std_conf": 0.14187215320638408,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4754463130659767,
"calib/step_q_c_n": 773.0,
"calib/step_q_gap": 0.018247783654212035,
"calib/step_q_w": 0.45719852941176464,
"calib/step_q_w_n": 816.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1854.0,
"completions/max_terminated_length": 1854.0,
"completions/mean_length": 531.5546875,
"completions/mean_terminated_length": 533.6392211914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.005589254200458527,
"kl": 0.071380615234375,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0061,
"num_tokens": 13380467.0,
"reward": 0.6137272119522095,
"reward_std": 0.22647228837013245,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7119367122650146,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2225489318370819,
"step": 56
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5939275568181819,
"calib/avg_num_step_conf": 6.32421875,
"calib/ece": 0.06612903225806455,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.03225806451612903,
"calib/gap": 0.05914204545454538,
"calib/mean_conf": 0.6889516129032257,
"calib/mu_c": 0.7099375,
"calib/mu_w": 0.6507954545454546,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05495967741935488,
"calib/std_conf": 0.14916335788694293,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5133693599160546,
"calib/step_q_c_n": 953.0,
"calib/step_q_gap": 0.09219818874488345,
"calib/step_q_w": 0.42117117117117114,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 543.9296875,
"completions/mean_terminated_length": 552.5635375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.0608,
"grad_norm": 0.005411534570157528,
"kl": 0.07976531982421875,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0103,
"num_tokens": 13626505.0,
"reward": 0.6742805242538452,
"reward_std": 0.19319787621498108,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7497960925102234,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.28001493215560913,
"step": 57
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5088778409090909,
"calib/avg_num_step_conf": 6.8515625,
"calib/ece": 0.18978714859437754,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0321285140562249,
"calib/gap": 0.008725271177685956,
"calib/mean_conf": 0.6730803212851405,
"calib/mu_c": 0.6773203125,
"calib/mu_w": 0.6685950413223141,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17440562248995986,
"calib/std_conf": 0.14457341200973897,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4824614443084455,
"calib/step_q_c_n": 817.0,
"calib/step_q_gap": 0.0501206403240983,
"calib/step_q_w": 0.4323408039843472,
"calib/step_q_w_n": 937.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2779.0,
"completions/max_terminated_length": 2779.0,
"completions/mean_length": 626.45703125,
"completions/mean_terminated_length": 626.45703125,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.005182043649256229,
"kl": 0.07784271240234375,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0669,
"num_tokens": 13893198.0,
"reward": 0.5870130658149719,
"reward_std": 0.23078709840774536,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6881850957870483,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.19209106266498566,
"step": 58
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47987708168120535,
"calib/avg_num_step_conf": 6.2734375,
"calib/ece": 0.16229249011857705,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.09486166007905138,
"calib/gap": -0.008536214644462081,
"calib/mean_conf": 0.726798418972332,
"calib/mu_c": 0.7235256410256411,
"calib/mu_w": 0.7320618556701032,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13624505928853756,
"calib/std_conf": 0.13775596684232014,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5421439073514602,
"calib/step_q_c_n": 993.0,
"calib/step_q_gap": 0.05318795302845858,
"calib/step_q_w": 0.48895595432300165,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2146.0,
"completions/max_terminated_length": 2146.0,
"completions/mean_length": 585.59765625,
"completions/mean_terminated_length": 585.59765625,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.005232793744653463,
"kl": 0.0828094482421875,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0239,
"num_tokens": 14149359.0,
"reward": 0.602554202079773,
"reward_std": 0.23856589198112488,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7199031114578247,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.16567403078079224,
"step": 59
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5296653144016228,
"calib/avg_num_step_conf": 6.5078125,
"calib/ece": 0.18662698412698414,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.06746031746031746,
"calib/gap": 0.03103448275862064,
"calib/mean_conf": 0.7032142857142857,
"calib/mu_c": 0.7175,
"calib/mu_w": 0.6864655172413794,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1750793650793651,
"calib/std_conf": 0.16750617648611568,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5243504171632896,
"calib/step_q_c_n": 839.0,
"calib/step_q_gap": 0.04954993348735248,
"calib/step_q_w": 0.4748004836759371,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 584.83203125,
"completions/mean_terminated_length": 584.83203125,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.064,
"grad_norm": 0.005824708379805088,
"kl": 0.08036041259765625,
"learning_rate": 3.88888888888889e-06,
"loss": 0.034,
"num_tokens": 14407932.0,
"reward": 0.6443766355514526,
"reward_std": 0.24165666103363037,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.701065182685852,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2845630347728729,
"step": 60
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.569233437391606,
"calib/avg_num_step_conf": 7.01953125,
"calib/ece": 0.13028225806451615,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0967741935483871,
"calib/gap": 0.031204301075268837,
"calib/mean_conf": 0.7464919354838709,
"calib/mu_c": 0.7581935483870967,
"calib/mu_w": 0.7269892473118279,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12588709677419357,
"calib/std_conf": 0.12576616781639505,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5683931458131264,
"calib/step_q_c_n": 1031.0,
"calib/step_q_gap": 0.07113727113949714,
"calib/step_q_w": 0.49725587467362925,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2928.0,
"completions/max_terminated_length": 2928.0,
"completions/mean_length": 526.8671875,
"completions/mean_terminated_length": 528.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.005149191245436668,
"kl": 0.09716796875,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0752,
"num_tokens": 14646874.0,
"reward": 0.6565155982971191,
"reward_std": 0.217573881149292,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7262473106384277,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.2719402015209198,
"step": 61
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5750872717508055,
"calib/avg_num_step_conf": 7.09375,
"calib/ece": 0.20253061224489796,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.08163265306122448,
"calib/gap": 0.03785714285714292,
"calib/mean_conf": 0.7369795918367347,
"calib/mu_c": 0.7542857142857143,
"calib/mu_w": 0.7164285714285714,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19832653061224492,
"calib/std_conf": 0.1401655711599996,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5544982290436835,
"calib/step_q_c_n": 847.0,
"calib/step_q_gap": 0.04672031366700646,
"calib/step_q_w": 0.507777915376677,
"calib/step_q_w_n": 969.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2984.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 603.61328125,
"completions/mean_terminated_length": 605.9804077148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.005013887770473957,
"kl": 0.088714599609375,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0404,
"num_tokens": 14908479.0,
"reward": 0.5980898141860962,
"reward_std": 0.2827584743499756,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6826468706130981,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.21822020411491394,
"step": 62
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6009721756620852,
"calib/avg_num_step_conf": 6.85546875,
"calib/ece": 0.1386904761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.11904761904761904,
"calib/gap": 0.04693865236339256,
"calib/mean_conf": 0.7537698412698413,
"calib/mu_c": 0.7714649681528662,
"calib/mu_w": 0.7245263157894737,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1347222222222222,
"calib/std_conf": 0.13496499620850624,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5428584905660377,
"calib/step_q_c_n": 1060.0,
"calib/step_q_gap": 0.022987986968915464,
"calib/step_q_w": 0.5198705035971223,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2199.0,
"completions/max_terminated_length": 2199.0,
"completions/mean_length": 618.79296875,
"completions/mean_terminated_length": 621.2196655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0672,
"grad_norm": 0.016880923882126808,
"kl": 0.158935546875,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0382,
"num_tokens": 15175530.0,
"reward": 0.69347083568573,
"reward_std": 0.22258234024047852,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7401214838027954,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.3272889256477356,
"step": 63
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5192307692307692,
"calib/avg_num_step_conf": 6.453125,
"calib/ece": 0.1411764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.08627450980392157,
"calib/gap": 0.027687491399477016,
"calib/mean_conf": 0.7347450980392157,
"calib/mu_c": 0.7440828402366864,
"calib/mu_w": 0.7163953488372093,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10658823529411766,
"calib/std_conf": 0.14909607944994172,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5333053221288515,
"calib/step_q_c_n": 1071.0,
"calib/step_q_gap": -0.004749755323816274,
"calib/step_q_w": 0.5380550774526678,
"calib/step_q_w_n": 581.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2278.0,
"completions/max_terminated_length": 2278.0,
"completions/mean_length": 591.609375,
"completions/mean_terminated_length": 591.609375,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.0052238283678889275,
"kl": 0.091217041015625,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0254,
"num_tokens": 15430758.0,
"reward": 0.6832898855209351,
"reward_std": 0.23457638919353485,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7575253844261169,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.27858564257621765,
"step": 64
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6231354927292019,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.21589843750000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.18359375,
"calib/gap": 0.05692254883604819,
"calib/mean_conf": 0.7901171875,
"calib/mu_c": 0.8143537414965987,
"calib/mu_w": 0.7574311926605505,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21589843750000004,
"calib/std_conf": 0.1254102094810859,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5790928495197438,
"calib/step_q_c_n": 937.0,
"calib/step_q_gap": 0.010912421385187332,
"calib/step_q_w": 0.5681804281345565,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1094.0,
"completions/max_terminated_length": 1094.0,
"completions/mean_length": 512.546875,
"completions/mean_terminated_length": 514.556884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.0056617711670696735,
"kl": 0.101287841796875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0176,
"num_tokens": 15666994.0,
"reward": 0.6725325584411621,
"reward_std": 0.1979164481163025,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7156343460083008,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.3161495625972748,
"step": 65
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5088254593175854,
"calib/avg_num_step_conf": 6.92578125,
"calib/ece": 0.30008097165991904,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.19433198380566802,
"calib/gap": 0.00526443569553825,
"calib/mean_conf": 0.7813765182186235,
"calib/mu_c": 0.7840833333333334,
"calib/mu_w": 0.7788188976377951,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2978137651821863,
"calib/std_conf": 0.1266051911588266,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5069146341463415,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.021710017147390848,
"calib/step_q_w": 0.4852046169989507,
"calib/step_q_w_n": 953.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2734.0,
"completions/max_terminated_length": 2734.0,
"completions/mean_length": 669.875,
"completions/mean_terminated_length": 672.5020141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.0704,
"grad_norm": 0.005155966151505709,
"kl": 0.078948974609375,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.018,
"num_tokens": 15944834.0,
"reward": 0.5533797740936279,
"reward_std": 0.24173156917095184,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6190582513809204,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2025451362133026,
"step": 66
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5295155709342562,
"calib/avg_num_step_conf": 6.4296875,
"calib/ece": 0.14752941176470588,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.17254901960784313,
"calib/gap": 0.012588235294117567,
"calib/mean_conf": 0.7839215686274509,
"calib/mu_c": 0.7881176470588235,
"calib/mu_w": 0.7755294117647059,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13239215686274514,
"calib/std_conf": 0.11805081045261738,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5469237832874196,
"calib/step_q_c_n": 1089.0,
"calib/step_q_gap": 0.0067981100378685655,
"calib/step_q_w": 0.5401256732495511,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1835.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 587.02734375,
"completions/mean_terminated_length": 589.3294677734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.005498452577739954,
"kl": 0.09470367431640625,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0151,
"num_tokens": 16200121.0,
"reward": 0.7042480707168579,
"reward_std": 0.19104528427124023,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7488437294960022,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.3284023404121399,
"step": 67
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5935358758888171,
"calib/avg_num_step_conf": 7.1328125,
"calib/ece": 0.2640562248995983,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.1686746987951807,
"calib/gap": 0.03897091144149967,
"calib/mean_conf": 0.786144578313253,
"calib/mu_c": 0.8047692307692308,
"calib/mu_w": 0.7657983193277311,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2640562248995983,
"calib/std_conf": 0.11682374042131591,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5402944640753827,
"calib/step_q_c_n": 849.0,
"calib/step_q_gap": 0.07223919283689756,
"calib/step_q_w": 0.46805527123848517,
"calib/step_q_w_n": 977.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2500.0,
"completions/max_terminated_length": 2500.0,
"completions/mean_length": 593.8671875,
"completions/mean_terminated_length": 598.5433349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.006504854653030634,
"kl": 0.101654052734375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0014,
"num_tokens": 16456239.0,
"reward": 0.5798185467720032,
"reward_std": 0.25953006744384766,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6659073829650879,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.19841712713241577,
"step": 68
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5633442622950819,
"calib/avg_num_step_conf": 6.3359375,
"calib/ece": 0.2683400809716599,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.15384615384615385,
"calib/gap": 0.03597704918032796,
"calib/mean_conf": 0.7541700404858298,
"calib/mu_c": 0.7723770491803279,
"calib/mu_w": 0.7363999999999999,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2642914979757085,
"calib/std_conf": 0.15699615584013946,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5594893617021277,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.0736187691903138,
"calib/step_q_w": 0.4858705925118139,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 671.21875,
"completions/mean_terminated_length": 679.1779174804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.0736,
"grad_norm": 0.00525682931765914,
"kl": 0.077056884765625,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0244,
"num_tokens": 16732567.0,
"reward": 0.5367077589035034,
"reward_std": 0.2361733615398407,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6514406204223633,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.13447493314743042,
"step": 69
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6777522935779816,
"calib/avg_num_step_conf": 6.671875,
"calib/ece": 0.2012244897959184,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.10612244897959183,
"calib/gap": 0.1016358607663248,
"calib/mean_conf": 0.756326530612245,
"calib/mu_c": 0.8015441176470588,
"calib/mu_w": 0.699908256880734,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2012244897959184,
"calib/std_conf": 0.15497062101495002,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5415924276169265,
"calib/step_q_c_n": 898.0,
"calib/step_q_gap": 0.08201218070334626,
"calib/step_q_w": 0.45958024691358024,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2505.0,
"completions/max_terminated_length": 2505.0,
"completions/mean_length": 679.6015625,
"completions/mean_terminated_length": 679.6015625,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.0051225475035607815,
"kl": 0.079864501953125,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0327,
"num_tokens": 17013537.0,
"reward": 0.6205503940582275,
"reward_std": 0.21652446687221527,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7026515603065491,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.2423553168773651,
"step": 70
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5751824817518248,
"calib/avg_num_step_conf": 6.4296875,
"calib/ece": 0.2329761904761904,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.15873015873015872,
"calib/gap": 0.04162424627102512,
"calib/mean_conf": 0.7511507936507936,
"calib/mu_c": 0.7701459854014598,
"calib/mu_w": 0.7285217391304347,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22023809523809518,
"calib/std_conf": 0.14548449802654848,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5440951276102088,
"calib/step_q_c_n": 862.0,
"calib/step_q_gap": 0.07089359699796394,
"calib/step_q_w": 0.4732015306122449,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 623.2265625,
"completions/mean_terminated_length": 623.2265625,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.0052217645570635796,
"kl": 0.08162689208984375,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0507,
"num_tokens": 17277491.0,
"reward": 0.618299126625061,
"reward_std": 0.23210731148719788,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6949383020401001,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.23931613564491272,
"step": 71
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6526579111944966,
"calib/avg_num_step_conf": 6.359375,
"calib/ece": 0.22972332015810273,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.06719367588932806,
"calib/gap": 0.062143214509068234,
"calib/mean_conf": 0.7435573122529644,
"calib/mu_c": 0.7737692307692308,
"calib/mu_w": 0.7116260162601625,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22972332015810273,
"calib/std_conf": 0.12023360692041232,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5231202046035807,
"calib/step_q_c_n": 782.0,
"calib/step_q_gap": 0.05049609112840331,
"calib/step_q_w": 0.47262411347517735,
"calib/step_q_w_n": 846.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 579.73828125,
"completions/mean_terminated_length": 579.73828125,
"completions/min_length": 259.0,
"completions/min_terminated_length": 259.0,
"epoch": 0.0768,
"grad_norm": 0.0058565582148730755,
"kl": 0.08962249755859375,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0249,
"num_tokens": 17530312.0,
"reward": 0.6693467497825623,
"reward_std": 0.21299782395362854,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7056429386138916,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.333831787109375,
"step": 72
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.603117666778411,
"calib/avg_num_step_conf": 5.9453125,
"calib/ece": 0.11738095238095247,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0873015873015873,
"calib/gap": 0.05125712370097213,
"calib/mean_conf": 0.7324603174603174,
"calib/mu_c": 0.7517834394904459,
"calib/mu_w": 0.7005263157894738,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11341269841269849,
"calib/std_conf": 0.13967471568996953,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5286980920314254,
"calib/step_q_c_n": 891.0,
"calib/step_q_gap": 0.054499993774690114,
"calib/step_q_w": 0.47419809825673526,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 569.91796875,
"completions/mean_terminated_length": 572.1529541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 258.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.005151054356247187,
"kl": 0.07952117919921875,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0296,
"num_tokens": 17783243.0,
"reward": 0.6734863519668579,
"reward_std": 0.24319618940353394,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7384711503982544,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2905329763889313,
"step": 73
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5709176788124156,
"calib/avg_num_step_conf": 6.03515625,
"calib/ece": 0.18209016393442626,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.09016393442622951,
"calib/gap": 0.0366491228070176,
"calib/mean_conf": 0.7148770491803279,
"calib/mu_c": 0.732,
"calib/mu_w": 0.6953508771929824,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.18209016393442626,
"calib/std_conf": 0.14029676262981772,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4818831168831168,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.04229602010892325,
"calib/step_q_w": 0.43958709677419355,
"calib/step_q_w_n": 775.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2871.0,
"completions/max_terminated_length": 2871.0,
"completions/mean_length": 607.51953125,
"completions/mean_terminated_length": 609.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.005812863353639841,
"kl": 0.07952880859375,
"learning_rate": 3.5e-06,
"loss": 0.0226,
"num_tokens": 18042696.0,
"reward": 0.6089438199996948,
"reward_std": 0.2539913058280945,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6791456937789917,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.24733565747737885,
"step": 74
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6744325391230791,
"calib/avg_num_step_conf": 6.2421875,
"calib/ece": 0.06035294117647065,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.10588235294117647,
"calib/gap": 0.08710700690821949,
"calib/mean_conf": 0.7347058823529413,
"calib/mu_c": 0.7627167630057804,
"calib/mu_w": 0.6756097560975609,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.058313725490196144,
"calib/std_conf": 0.13564013723136314,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4711592270531401,
"calib/step_q_c_n": 1035.0,
"calib/step_q_gap": 0.08709119863395715,
"calib/step_q_w": 0.38406802841918297,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 558.71484375,
"completions/mean_terminated_length": 558.71484375,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.08,
"grad_norm": 0.005435822065919638,
"kl": 0.0779571533203125,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0151,
"num_tokens": 18290479.0,
"reward": 0.7145378589630127,
"reward_std": 0.2187555432319641,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7951613664627075,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.29953932762145996,
"step": 75
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6992082662372516,
"calib/avg_num_step_conf": 5.2734375,
"calib/ece": 0.07870078740157484,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.08267716535433071,
"calib/gap": 0.08666129898013963,
"calib/mean_conf": 0.7011417322834644,
"calib/mu_c": 0.7325308641975309,
"calib/mu_w": 0.6458695652173913,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07102362204724412,
"calib/std_conf": 0.13246852096184245,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5043577430972389,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.06422234657886366,
"calib/step_q_w": 0.44013539651837524,
"calib/step_q_w_n": 517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1250.0,
"completions/max_terminated_length": 1250.0,
"completions/mean_length": 552.93359375,
"completions/mean_terminated_length": 555.1019897460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 243.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.0056373546831309795,
"kl": 0.0784912109375,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0158,
"num_tokens": 18535086.0,
"reward": 0.7332932949066162,
"reward_std": 0.19764763116836548,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7813144326210022,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.35949093103408813,
"step": 76
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5792428975159899,
"calib/avg_num_step_conf": 5.80078125,
"calib/ece": 0.06327935222672058,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.04048582995951417,
"calib/gap": 0.04321582626803522,
"calib/mean_conf": 0.675587044534413,
"calib/mu_c": 0.6897590361445783,
"calib/mu_w": 0.6465432098765431,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.03340080971659914,
"calib/std_conf": 0.13516656209384506,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45939362795477906,
"calib/step_q_c_n": 973.0,
"calib/step_q_gap": 0.07753815920477908,
"calib/step_q_w": 0.38185546875,
"calib/step_q_w_n": 512.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2483.0,
"completions/max_terminated_length": 2483.0,
"completions/mean_length": 583.375,
"completions/mean_terminated_length": 587.968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.005375280976295471,
"kl": 0.07251739501953125,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0405,
"num_tokens": 18789094.0,
"reward": 0.7070667743682861,
"reward_std": 0.2357901632785797,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.752937912940979,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.3377581536769867,
"step": 77
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5703867255452965,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.05604743083003952,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.023715415019762844,
"calib/gap": 0.04218988358089115,
"calib/mean_conf": 0.66300395256917,
"calib/mu_c": 0.6786792452830188,
"calib/mu_w": 0.6364893617021277,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04529644268774706,
"calib/std_conf": 0.13493252589958443,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4545704994192799,
"calib/step_q_c_n": 861.0,
"calib/step_q_gap": 0.028793606989001075,
"calib/step_q_w": 0.42577689243027883,
"calib/step_q_w_n": 502.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 613.1875,
"completions/mean_terminated_length": 613.1875,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.0832,
"grad_norm": 0.005595955532044172,
"kl": 0.071319580078125,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.038,
"num_tokens": 19054094.0,
"reward": 0.6896684169769287,
"reward_std": 0.17376267910003662,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7557179927825928,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.3025251030921936,
"step": 78
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6021731123388582,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.05882812499999987,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.01953125,
"calib/gap": 0.053334806629834364,
"calib/mean_conf": 0.672109375,
"calib/mu_c": 0.6877348066298343,
"calib/mu_w": 0.6344,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.011953124999999992,
"calib/std_conf": 0.12951925739869488,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42610837438423643,
"calib/step_q_c_n": 1015.0,
"calib/step_q_gap": -0.026398292282430236,
"calib/step_q_w": 0.45250666666666667,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1292.0,
"completions/max_terminated_length": 1292.0,
"completions/mean_length": 589.1015625,
"completions/mean_terminated_length": 591.4118041992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.005253465846180916,
"kl": 0.06707000732421875,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0086,
"num_tokens": 19311280.0,
"reward": 0.7282896041870117,
"reward_std": 0.18334491550922394,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7965078353881836,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.319446325302124,
"step": 79
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6037854191263283,
"calib/avg_num_step_conf": 5.68359375,
"calib/ece": 0.03363636363636367,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.02766798418972332,
"calib/gap": 0.038896103896103984,
"calib/mean_conf": 0.6858893280632411,
"calib/mu_c": 0.6977272727272728,
"calib/mu_w": 0.6588311688311688,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.011936758893280646,
"calib/std_conf": 0.1144936420760257,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45525229826353425,
"calib/step_q_c_n": 979.0,
"calib/step_q_gap": 0.05777330666689562,
"calib/step_q_w": 0.39747899159663863,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1753.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 523.2421875,
"completions/mean_terminated_length": 527.3621826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.0054220338352024555,
"kl": 0.08350372314453125,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0257,
"num_tokens": 19547390.0,
"reward": 0.7184160351753235,
"reward_std": 0.1986905336380005,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7822699546813965,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.3194058835506439,
"step": 80
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6122504785343177,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.05792828685258968,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.05179282868525897,
"calib/gap": 0.05364027891714529,
"calib/mean_conf": 0.6771314741035855,
"calib/mu_c": 0.6967924528301888,
"calib/mu_w": 0.6431521739130435,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05079681274900403,
"calib/std_conf": 0.1409322215857465,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43097772277227725,
"calib/step_q_c_n": 808.0,
"calib/step_q_gap": 0.03289758921635072,
"calib/step_q_w": 0.39808013355592653,
"calib/step_q_w_n": 599.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2711.0,
"completions/max_terminated_length": 2711.0,
"completions/mean_length": 577.1875,
"completions/mean_terminated_length": 584.0316162109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.0864,
"grad_norm": 0.005068330559879541,
"kl": 0.06775665283203125,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0248,
"num_tokens": 19801398.0,
"reward": 0.6715902090072632,
"reward_std": 0.2226143777370453,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7558960914611816,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2669718861579895,
"step": 81
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5481099656357388,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.11867187499999998,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.05078125,
"calib/gap": 0.043197821435518424,
"calib/mean_conf": 0.6825,
"calib/mu_c": 0.6988679245283019,
"calib/mu_w": 0.6556701030927835,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.09003906249999999,
"calib/std_conf": 0.14485984605818136,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4361780104712042,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.03580399472317275,
"calib/step_q_w": 0.40037401574803144,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1453.0,
"completions/max_terminated_length": 1453.0,
"completions/mean_length": 491.01171875,
"completions/mean_terminated_length": 492.9372863769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.006127932574599981,
"kl": 0.079132080078125,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0023,
"num_tokens": 20032649.0,
"reward": 0.6537767648696899,
"reward_std": 0.2364278882741928,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7447081804275513,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.24253278970718384,
"step": 82
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6137151137151138,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.14070866141732277,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.05905511811023622,
"calib/gap": 0.06377748377748393,
"calib/mean_conf": 0.695275590551181,
"calib/mu_c": 0.7231468531468531,
"calib/mu_w": 0.6593693693693692,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.13649606299212594,
"calib/std_conf": 0.1493704730630403,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.43745479833101525,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.07338444657222126,
"calib/step_q_w": 0.364070351758794,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1725.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 590.90625,
"completions/mean_terminated_length": 595.55908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.00493778008967638,
"kl": 0.06632232666015625,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0139,
"num_tokens": 20291185.0,
"reward": 0.6520576477050781,
"reward_std": 0.19987721741199493,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7316687107086182,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.26463404297828674,
"step": 83
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6889215686274509,
"calib/avg_num_step_conf": 4.86328125,
"calib/ece": 0.08845238095238102,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.047619047619047616,
"calib/gap": 0.10650588235294134,
"calib/mean_conf": 0.6836904761904763,
"calib/mu_c": 0.7268000000000001,
"calib/mu_w": 0.6202941176470588,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08845238095238102,
"calib/std_conf": 0.15513777615989194,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4405128205128205,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": 0.026928164428164425,
"calib/step_q_w": 0.41358465608465605,
"calib/step_q_w_n": 504.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 516.78515625,
"completions/mean_terminated_length": 518.811767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.0896,
"grad_norm": 0.005957477726042271,
"kl": 0.07054901123046875,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0353,
"num_tokens": 20529402.0,
"reward": 0.6486536264419556,
"reward_std": 0.2027164101600647,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7663355469703674,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2169092893600464,
"step": 84
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.656754161331626,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.12190476190476196,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.05952380952380952,
"calib/gap": 0.08846862996158766,
"calib/mean_conf": 0.6853968253968253,
"calib/mu_c": 0.7240140845070423,
"calib/mu_w": 0.6355454545454546,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12190476190476196,
"calib/std_conf": 0.14734502643563752,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41172610556348077,
"calib/step_q_c_n": 701.0,
"calib/step_q_gap": 0.04905776385493804,
"calib/step_q_w": 0.36266834170854273,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2457.0,
"completions/max_terminated_length": 2457.0,
"completions/mean_length": 551.07421875,
"completions/mean_terminated_length": 553.2353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.006076201796531677,
"kl": 0.07193756103515625,
"learning_rate": 3.1944444444444443e-06,
"loss": 0.0252,
"num_tokens": 20778301.0,
"reward": 0.6651642918586731,
"reward_std": 0.20453515648841858,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7460108995437622,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2772863209247589,
"step": 85
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5978315210598676,
"calib/avg_num_step_conf": 4.57421875,
"calib/ece": 0.18383399209486168,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.043478260869565216,
"calib/gap": 0.05948381452318463,
"calib/mean_conf": 0.6745849802371541,
"calib/mu_c": 0.7044444444444447,
"calib/mu_w": 0.64496062992126,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18019762845849802,
"calib/std_conf": 0.15813239840899346,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4352115059221659,
"calib/step_q_c_n": 591.0,
"calib/step_q_gap": 0.022832195577338288,
"calib/step_q_w": 0.4123793103448276,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2520.0,
"completions/max_terminated_length": 2520.0,
"completions/mean_length": 555.59765625,
"completions/mean_terminated_length": 555.59765625,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.005948134697973728,
"kl": 0.0674591064453125,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0117,
"num_tokens": 21026046.0,
"reward": 0.6285470724105835,
"reward_std": 0.1985204517841339,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7081304788589478,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2552136182785034,
"step": 86
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6512233622730861,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.08999999999999994,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.035856573705179286,
"calib/gap": 0.058494869771112734,
"calib/mean_conf": 0.6917529880478087,
"calib/mu_c": 0.7080662983425413,
"calib/mu_w": 0.6495714285714286,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.030318725099601596,
"calib/std_conf": 0.11360146815410785,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.46357231149567363,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.04696625088961304,
"calib/step_q_w": 0.4166060606060606,
"calib/step_q_w_n": 330.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2547.0,
"completions/max_terminated_length": 2547.0,
"completions/mean_length": 486.36328125,
"completions/mean_terminated_length": 488.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0928,
"grad_norm": 0.006253804080188274,
"kl": 0.07823944091796875,
"learning_rate": 3.138888888888889e-06,
"loss": 0.0307,
"num_tokens": 21256051.0,
"reward": 0.7405951023101807,
"reward_std": 0.18091410398483276,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7928581833839417,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.35083192586898804,
"step": 87
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6947640966628309,
"calib/avg_num_step_conf": 4.66015625,
"calib/ece": 0.08686274509803911,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.03137254901960784,
"calib/gap": 0.0777927215189873,
"calib/mean_conf": 0.7002745098039216,
"calib/mu_c": 0.724375,
"calib/mu_w": 0.6465822784810127,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.048470588235294064,
"calib/std_conf": 0.11513471707413059,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4650488997555012,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.021502233088834533,
"calib/step_q_w": 0.44354666666666664,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1178.0,
"completions/max_terminated_length": 1178.0,
"completions/mean_length": 500.49609375,
"completions/mean_terminated_length": 502.4588623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.006577801890671253,
"kl": 0.0890960693359375,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0028,
"num_tokens": 21494026.0,
"reward": 0.7101141810417175,
"reward_std": 0.19201169908046722,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.802936315536499,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.2805732488632202,
"step": 88
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7023994047988096,
"calib/avg_num_step_conf": 4.546875,
"calib/ece": 0.1671653543307087,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.03543307086614173,
"calib/gap": 0.10110236220472457,
"calib/mean_conf": 0.652992125984252,
"calib/mu_c": 0.7035433070866142,
"calib/mu_w": 0.6024409448818896,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16007874015748033,
"calib/std_conf": 0.1607159912991228,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4362673611111111,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.06159049036281178,
"calib/step_q_w": 0.3746768707482993,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1747.0,
"completions/max_terminated_length": 1747.0,
"completions/mean_length": 556.70703125,
"completions/mean_terminated_length": 558.8902587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.005704918876290321,
"kl": 0.07068634033203125,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.0023,
"num_tokens": 21745431.0,
"reward": 0.6401622295379639,
"reward_std": 0.18493416905403137,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7454453110694885,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2372228503227234,
"step": 89
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6815751445086705,
"calib/avg_num_step_conf": 5.15625,
"calib/ece": 0.08272727272727273,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.05533596837944664,
"calib/gap": 0.09759537572254329,
"calib/mean_conf": 0.6687351778656127,
"calib/mu_c": 0.6995953757225434,
"calib/mu_w": 0.6020000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.033833992094861674,
"calib/std_conf": 0.16450903427883745,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41314831460674156,
"calib/step_q_c_n": 890.0,
"calib/step_q_gap": 0.0036599425137182617,
"calib/step_q_w": 0.4094883720930233,
"calib/step_q_w_n": 430.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2540.0,
"completions/max_terminated_length": 2540.0,
"completions/mean_length": 503.7109375,
"completions/mean_terminated_length": 505.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.096,
"grad_norm": 0.005787891335785389,
"kl": 0.07630157470703125,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0017,
"num_tokens": 21977701.0,
"reward": 0.7060627937316895,
"reward_std": 0.18334735929965973,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7868348360061646,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.29325956106185913,
"step": 90
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7528893780957622,
"calib/avg_num_step_conf": 4.8046875,
"calib/ece": 0.09371999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.036,
"calib/gap": 0.10940423775454045,
"calib/mean_conf": 0.6686,
"calib/mu_c": 0.7088607594936709,
"calib/mu_w": 0.5994565217391304,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.06516,
"calib/std_conf": 0.13455125417475675,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43705792682926836,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.043469076655052286,
"calib/step_q_w": 0.3935888501742161,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1873.0,
"completions/max_terminated_length": 1873.0,
"completions/mean_length": 541.3828125,
"completions/mean_terminated_length": 543.5059204101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.005156281869858503,
"kl": 0.0768280029296875,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0131,
"num_tokens": 22224007.0,
"reward": 0.661525309085846,
"reward_std": 0.19307063519954681,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7801464796066284,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2241540551185608,
"step": 91
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6775775245163,
"calib/avg_num_step_conf": 4.24609375,
"calib/ece": 0.0763095238095238,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.06746031746031746,
"calib/gap": 0.10284786641929489,
"calib/mean_conf": 0.6794841269841269,
"calib/mu_c": 0.7194805194805194,
"calib/mu_w": 0.6166326530612245,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.07234126984126983,
"calib/std_conf": 0.15808166636302534,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4584123711340206,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.056303057408530366,
"calib/step_q_w": 0.40210931372549025,
"calib/step_q_w_n": 408.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2203.0,
"completions/max_terminated_length": 2203.0,
"completions/mean_length": 456.45703125,
"completions/mean_terminated_length": 458.2471008300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.006089536473155022,
"kl": 0.0848388671875,
"learning_rate": 3e-06,
"loss": 0.0016,
"num_tokens": 22447580.0,
"reward": 0.6949906349182129,
"reward_std": 0.2117110639810562,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7602003812789917,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.3141559362411499,
"step": 92
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6703247336670518,
"calib/avg_num_step_conf": 5.21875,
"calib/ece": 0.12201581027667986,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.09090909090909091,
"calib/gap": 0.09191503016300873,
"calib/mean_conf": 0.6801976284584981,
"calib/mu_c": 0.7187074829931972,
"calib/mu_w": 0.6267924528301885,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11059288537549408,
"calib/std_conf": 0.1633746164213354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4623680649526387,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.02143004150205241,
"calib/step_q_w": 0.4409380234505863,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 547.13671875,
"completions/mean_terminated_length": 549.2824096679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0992,
"grad_norm": 0.006107079330831766,
"kl": 0.07598114013671875,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0411,
"num_tokens": 22693423.0,
"reward": 0.6424725651741028,
"reward_std": 0.2015823870897293,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7558277249336243,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.21661736071109772,
"step": 93
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7461220597584233,
"calib/avg_num_step_conf": 4.41015625,
"calib/ece": 0.1146640316205533,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.07509881422924901,
"calib/gap": 0.14400699300699316,
"calib/mean_conf": 0.6776679841897233,
"calib/mu_c": 0.7402797202797203,
"calib/mu_w": 0.5962727272727272,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11355731225296437,
"calib/std_conf": 0.17314715474512357,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.46685126582278486,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": 0.06349110485699011,
"calib/step_q_w": 0.40336016096579475,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 474.046875,
"completions/mean_terminated_length": 475.9059143066406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.006255537271499634,
"kl": 0.07927703857421875,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0006,
"num_tokens": 22923459.0,
"reward": 0.6437188386917114,
"reward_std": 0.19185274839401245,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7732378840446472,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2048247754573822,
"step": 94
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7481814291827129,
"calib/avg_num_step_conf": 5.01953125,
"calib/ece": 0.08197628458498027,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.08300395256916997,
"calib/gap": 0.1519940094137785,
"calib/mean_conf": 0.6468774703557312,
"calib/mu_c": 0.6961403508771931,
"calib/mu_w": 0.5441463414634146,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.026482213438735164,
"calib/std_conf": 0.1890315781970204,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4154257641921397,
"calib/step_q_c_n": 916.0,
"calib/step_q_gap": -0.004818138246884651,
"calib/step_q_w": 0.4202439024390244,
"calib/step_q_w_n": 369.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2495.0,
"completions/max_terminated_length": 2495.0,
"completions/mean_length": 528.4921875,
"completions/mean_terminated_length": 530.5647583007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.005513823591172695,
"kl": 0.07430267333984375,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0091,
"num_tokens": 23164881.0,
"reward": 0.7214862108230591,
"reward_std": 0.1887528896331787,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.801451563835144,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.31027084589004517,
"step": 95
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7872743391360413,
"calib/avg_num_step_conf": 4.7265625,
"calib/ece": 0.06224409448818897,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.14173228346456693,
"calib/gap": 0.20052546744036082,
"calib/mean_conf": 0.7112992125984252,
"calib/mu_c": 0.7634042553191488,
"calib/mu_w": 0.562878787878788,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.016692913385826784,
"calib/std_conf": 0.18306093021889827,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45240015816528273,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.11373530802904297,
"calib/step_q_w": 0.33866485013623976,
"calib/step_q_w_n": 367.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2078.0,
"completions/max_terminated_length": 2078.0,
"completions/mean_length": 483.61328125,
"completions/mean_terminated_length": 483.61328125,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.1024,
"grad_norm": 0.005868226755410433,
"kl": 0.08411407470703125,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0118,
"num_tokens": 23394502.0,
"reward": 0.6806752681732178,
"reward_std": 0.17523370683193207,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.8438191413879395,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.1722189486026764,
"step": 96
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6550604423868314,
"calib/avg_num_step_conf": 4.85546875,
"calib/ece": 0.13718253968253968,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.07936507936507936,
"calib/gap": 0.07474537037037043,
"calib/mean_conf": 0.6506746031746031,
"calib/mu_c": 0.6827083333333334,
"calib/mu_w": 0.6079629629629629,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10821428571428574,
"calib/std_conf": 0.18980551154311645,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4440243902439024,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.07150309552499268,
"calib/step_q_w": 0.3725212947189097,
"calib/step_q_w_n": 587.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2937.0,
"completions/max_terminated_length": 2937.0,
"completions/mean_length": 488.90625,
"completions/mean_terminated_length": 492.7558898925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.0057335710152983665,
"kl": 0.08429718017578125,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0263,
"num_tokens": 23624734.0,
"reward": 0.6633927822113037,
"reward_std": 0.23494769632816315,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7351964712142944,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2829952538013458,
"step": 97
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6662457912457912,
"calib/avg_num_step_conf": 4.48046875,
"calib/ece": 0.1360557768924303,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.14342629482071714,
"calib/gap": 0.1102907277907279,
"calib/mean_conf": 0.7010756972111555,
"calib/mu_c": 0.7485314685314687,
"calib/mu_w": 0.6382407407407408,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13370517928286857,
"calib/std_conf": 0.18136720210395232,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43551724137931036,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.06094475067299071,
"calib/step_q_w": 0.37457249070631965,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 521.59765625,
"completions/mean_terminated_length": 525.7047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.0058539328165352345,
"kl": 0.07634735107421875,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.005,
"num_tokens": 23864447.0,
"reward": 0.6528790593147278,
"reward_std": 0.2209436148405075,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7439660429954529,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.25397956371307373,
"step": 98
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6324419285574858,
"calib/avg_num_step_conf": 4.8125,
"calib/ece": 0.22012,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.108,
"calib/gap": 0.09897911380050772,
"calib/mean_conf": 0.62124,
"calib/mu_c": 0.6770642201834863,
"calib/mu_w": 0.5780851063829786,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20268,
"calib/std_conf": 0.23681144904754922,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4099226305609284,
"calib/step_q_c_n": 517.0,
"calib/step_q_gap": 0.06525130188959977,
"calib/step_q_w": 0.34467132867132866,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2261.0,
"completions/max_terminated_length": 2261.0,
"completions/mean_length": 590.921875,
"completions/mean_terminated_length": 597.9288940429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.1056,
"grad_norm": 0.005409719422459602,
"kl": 0.065093994140625,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0199,
"num_tokens": 24121523.0,
"reward": 0.5566372871398926,
"reward_std": 0.23340797424316406,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.695084810256958,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.13850226998329163,
"step": 99
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7833354006450013,
"calib/avg_num_step_conf": 5.05078125,
"calib/ece": 0.1381176470588235,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.1568627450980392,
"calib/gap": 0.2238284544777971,
"calib/mean_conf": 0.6727843137254902,
"calib/mu_c": 0.7746043165467627,
"calib/mu_w": 0.5507758620689656,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1329019607843137,
"calib/std_conf": 0.231705722231293,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.388548051948052,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.051148434357229766,
"calib/step_q_w": 0.3373996175908222,
"calib/step_q_w_n": 523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1983.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 561.5,
"completions/mean_terminated_length": 561.5,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.005751878954470158,
"kl": 0.071319580078125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0012,
"num_tokens": 24372675.0,
"reward": 0.6705541610717773,
"reward_std": 0.19056624174118042,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7899484038352966,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.2433473914861679,
"step": 100
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.67578125,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.159484126984127,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.1349206349206349,
"calib/gap": 0.13800403225806446,
"calib/mean_conf": 0.641468253968254,
"calib/mu_c": 0.709375,
"calib/mu_w": 0.5713709677419355,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14650793650793653,
"calib/std_conf": 0.22893955626051365,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38006134969325156,
"calib/step_q_c_n": 652.0,
"calib/step_q_gap": 0.06513617962522439,
"calib/step_q_w": 0.31492517006802717,
"calib/step_q_w_n": 735.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2913.0,
"completions/max_terminated_length": 2913.0,
"completions/mean_length": 568.0859375,
"completions/mean_terminated_length": 568.0859375,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.005723032634705305,
"kl": 0.0717315673828125,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0576,
"num_tokens": 24625097.0,
"reward": 0.6526573896408081,
"reward_std": 0.21372900903224945,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.73505699634552,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2741639316082001,
"step": 101
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6968745870225981,
"calib/avg_num_step_conf": 4.85546875,
"calib/ece": 0.10972549019607844,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.17254901960784313,
"calib/gap": 0.15897185146028792,
"calib/mean_conf": 0.6816470588235294,
"calib/mu_c": 0.7402484472049689,
"calib/mu_w": 0.5812765957446809,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08000000000000002,
"calib/std_conf": 0.22704329927626454,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3889878542510122,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": 0.02321096182073329,
"calib/step_q_w": 0.3657768924302789,
"calib/step_q_w_n": 502.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1583.0,
"completions/max_terminated_length": 1583.0,
"completions/mean_length": 454.36328125,
"completions/mean_terminated_length": 456.1451110839844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.1088,
"grad_norm": 0.00621482077986002,
"kl": 0.08046722412109375,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0413,
"num_tokens": 24848110.0,
"reward": 0.6459211707115173,
"reward_std": 0.18273219466209412,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7812562584877014,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.18636733293533325,
"step": 102
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.743279569892473,
"calib/avg_num_step_conf": 4.796875,
"calib/ece": 0.05972332015810276,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2015810276679842,
"calib/gap": 0.2100994623655914,
"calib/mean_conf": 0.6450197628458498,
"calib/mu_c": 0.7222500000000001,
"calib/mu_w": 0.5121505376344087,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.03616600790513833,
"calib/std_conf": 0.25069488329564227,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3800127551020408,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.016251493840779563,
"calib/step_q_w": 0.36376126126126124,
"calib/step_q_w_n": 444.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2345.0,
"completions/max_terminated_length": 2345.0,
"completions/mean_length": 573.76953125,
"completions/mean_terminated_length": 573.76953125,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.00558763463050127,
"kl": 0.07482147216796875,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0177,
"num_tokens": 25099547.0,
"reward": 0.6870755553245544,
"reward_std": 0.16483908891677856,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7903547286987305,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2619214355945587,
"step": 103
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.699807369663829,
"calib/avg_num_step_conf": 5.375,
"calib/ece": 0.1439370078740157,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.1062992125984252,
"calib/gap": 0.1560367861803268,
"calib/mean_conf": 0.5893700787401575,
"calib/mu_c": 0.6710743801652892,
"calib/mu_w": 0.5150375939849624,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12846456692913383,
"calib/std_conf": 0.24024838862847026,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42789137380191694,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.06739804046858361,
"calib/step_q_w": 0.36049333333333333,
"calib/step_q_w_n": 750.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 541.4765625,
"completions/mean_terminated_length": 543.6000366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.005712118931114674,
"kl": 0.0785980224609375,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0024,
"num_tokens": 25344845.0,
"reward": 0.6707723140716553,
"reward_std": 0.19318893551826477,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7519944906234741,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2965813875198364,
"step": 104
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6725705329153606,
"calib/avg_num_step_conf": 5.33984375,
"calib/ece": 0.13341176470588229,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.20784313725490197,
"calib/gap": 0.170163009404389,
"calib/mean_conf": 0.6169411764705883,
"calib/mu_c": 0.6903448275862071,
"calib/mu_w": 0.5201818181818181,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09086274509803917,
"calib/std_conf": 0.28378267104058635,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37041958041958045,
"calib/step_q_c_n": 715.0,
"calib/step_q_gap": 0.056830623364365784,
"calib/step_q_w": 0.31358895705521467,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 506.34765625,
"completions/mean_terminated_length": 508.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.112,
"grad_norm": 0.0058001065626740456,
"kl": 0.07666015625,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0332,
"num_tokens": 25580230.0,
"reward": 0.6575684547424316,
"reward_std": 0.20606115460395813,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.752371072769165,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.25026577711105347,
"step": 105
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7404392275653162,
"calib/avg_num_step_conf": 4.6484375,
"calib/ece": 0.0896442687747035,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.17391304347826086,
"calib/gap": 0.22103054398586397,
"calib/mean_conf": 0.6245059288537549,
"calib/mu_c": 0.7241007194244605,
"calib/mu_w": 0.5030701754385966,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08237154150197624,
"calib/std_conf": 0.25921333745136416,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3931710914454277,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.02887421644542776,
"calib/step_q_w": 0.36429687499999996,
"calib/step_q_w_n": 512.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2005.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 490.4765625,
"completions/mean_terminated_length": 490.4765625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.00585206039249897,
"kl": 0.07993316650390625,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0038,
"num_tokens": 25810376.0,
"reward": 0.6838876008987427,
"reward_std": 0.18188489973545074,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.779799222946167,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2817259728908539,
"step": 106
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.587069372345446,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.15594488188976385,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.19291338582677164,
"calib/gap": 0.08754870895975198,
"calib/mean_conf": 0.646732283464567,
"calib/mu_c": 0.6780981595092026,
"calib/mu_w": 0.5905494505494506,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08047244094488193,
"calib/std_conf": 0.253453219800162,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36181850117096026,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": -0.009590589738130706,
"calib/step_q_w": 0.37140909090909097,
"calib/step_q_w_n": 440.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1641.0,
"completions/max_terminated_length": 1641.0,
"completions/mean_length": 475.3515625,
"completions/mean_terminated_length": 479.094482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.005947344470769167,
"kl": 0.08448028564453125,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.0205,
"num_tokens": 26036682.0,
"reward": 0.7219668030738831,
"reward_std": 0.21111388504505157,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7402527332305908,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.3778996169567108,
"step": 107
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6728257275132274,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.1403162055335968,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2924901185770751,
"calib/gap": 0.18115244708994716,
"calib/mean_conf": 0.6612648221343873,
"calib/mu_c": 0.7070899470899472,
"calib/mu_w": 0.5259375000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.027272727272727282,
"calib/std_conf": 0.29326427300611696,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37370160427807486,
"calib/step_q_c_n": 935.0,
"calib/step_q_gap": 0.06415230850342696,
"calib/step_q_w": 0.3095492957746479,
"calib/step_q_w_n": 355.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2140.0,
"completions/max_terminated_length": 2140.0,
"completions/mean_length": 525.47265625,
"completions/mean_terminated_length": 531.7035522460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.1152,
"grad_norm": 0.006274311803281307,
"kl": 0.0753326416015625,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0022,
"num_tokens": 26274435.0,
"reward": 0.7267597913742065,
"reward_std": 0.17975559830665588,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7769195437431335,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.33128753304481506,
"step": 108
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8266497775580821,
"calib/avg_num_step_conf": 5.18359375,
"calib/ece": 0.1487058823529412,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.21568627450980393,
"calib/gap": 0.3182773109243698,
"calib/mean_conf": 0.6082352941176471,
"calib/mu_c": 0.7779831932773109,
"calib/mu_w": 0.45970588235294113,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14513725490196078,
"calib/std_conf": 0.2847007777860333,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39307692307692305,
"calib/step_q_c_n": 559.0,
"calib/step_q_gap": 0.07746494391025638,
"calib/step_q_w": 0.31561197916666667,
"calib/step_q_w_n": 768.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 520.66015625,
"completions/mean_terminated_length": 522.7019653320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.0058349259197711945,
"kl": 0.08112335205078125,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0083,
"num_tokens": 26512324.0,
"reward": 0.6595449447631836,
"reward_std": 0.1664503961801529,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.8052883148193359,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.22161419689655304,
"step": 109
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6886036358897505,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.13408730158730164,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2619047619047619,
"calib/gap": 0.18284681045155393,
"calib/mean_conf": 0.6543253968253968,
"calib/mu_c": 0.7290604026845637,
"calib/mu_w": 0.5462135922330098,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09857142857142867,
"calib/std_conf": 0.284922610750544,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3879663608562691,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.057772562406656736,
"calib/step_q_w": 0.3301937984496124,
"calib/step_q_w_n": 516.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2256.0,
"completions/max_terminated_length": 2256.0,
"completions/mean_length": 477.84375,
"completions/mean_terminated_length": 481.6062927246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.006158989388495684,
"kl": 0.085357666015625,
"learning_rate": 2.5e-06,
"loss": 0.0093,
"num_tokens": 26739572.0,
"reward": 0.685361385345459,
"reward_std": 0.21978026628494263,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7471511363983154,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.3110716640949249,
"step": 110
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7192771845580337,
"calib/avg_num_step_conf": 4.50390625,
"calib/ece": 0.17658730158730157,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.30952380952380953,
"calib/gap": 0.219637328615657,
"calib/mean_conf": 0.6507936507936508,
"calib/mu_c": 0.7545112781954888,
"calib/mu_w": 0.5348739495798318,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14980158730158727,
"calib/std_conf": 0.30543126723913944,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3840105078809107,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.03663937386029209,
"calib/step_q_w": 0.3473711340206186,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2691.0,
"completions/max_terminated_length": 2691.0,
"completions/mean_length": 554.46875,
"completions/mean_terminated_length": 554.46875,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1184,
"grad_norm": 0.0054856035858392715,
"kl": 0.07257080078125,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0214,
"num_tokens": 26988924.0,
"reward": 0.6293383836746216,
"reward_std": 0.21292832493782043,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.740082859992981,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.21781271696090698,
"step": 111
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7825503355704698,
"calib/avg_num_step_conf": 4.78515625,
"calib/ece": 0.12527559055118107,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.21653543307086615,
"calib/gap": 0.3289434324065196,
"calib/mean_conf": 0.5626771653543307,
"calib/mu_c": 0.6986577181208053,
"calib/mu_w": 0.3697142857142858,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.050669291338582625,
"calib/std_conf": 0.3290596212853663,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.376,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.07784905660377361,
"calib/step_q_w": 0.2981509433962264,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 551.30078125,
"completions/mean_terminated_length": 551.30078125,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.006195261608809233,
"kl": 0.0721588134765625,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0561,
"num_tokens": 27237977.0,
"reward": 0.6678087711334229,
"reward_std": 0.19492530822753906,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.8018710613250732,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2189026176929474,
"step": 112
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7812828947368422,
"calib/avg_num_step_conf": 5.14453125,
"calib/ece": 0.08226190476190479,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.31746031746031744,
"calib/gap": 0.30880263157894733,
"calib/mean_conf": 0.6672619047619048,
"calib/mu_c": 0.7898026315789474,
"calib/mu_w": 0.48100000000000004,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0731746031746032,
"calib/std_conf": 0.3011262636237346,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3532605729877217,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.04346605243977647,
"calib/step_q_w": 0.3097945205479452,
"calib/step_q_w_n": 584.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2323.0,
"completions/max_terminated_length": 2323.0,
"completions/mean_length": 497.7265625,
"completions/mean_terminated_length": 499.678466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.00658282358199358,
"kl": 0.0800018310546875,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0095,
"num_tokens": 27470595.0,
"reward": 0.7177433371543884,
"reward_std": 0.21778149902820587,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.8009738326072693,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.31810659170150757,
"step": 113
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7824581005586593,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.09307086614173234,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3346456692913386,
"calib/gap": 0.33646480446927385,
"calib/mean_conf": 0.6803149606299213,
"calib/mu_c": 0.7796648044692738,
"calib/mu_w": 0.4431999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.034330708661417374,
"calib/std_conf": 0.3222080244749067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40392541176470587,
"calib/step_q_c_n": 850.0,
"calib/step_q_gap": 0.12774359358288767,
"calib/step_q_w": 0.2761818181818182,
"calib/step_q_w_n": 440.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 485.6484375,
"completions/mean_terminated_length": 487.552978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.1216,
"grad_norm": 0.8515591621398926,
"kl": 4.70794677734375,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0792,
"num_tokens": 27699945.0,
"reward": 0.7293667197227478,
"reward_std": 0.1959434449672699,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.8210617303848267,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2993904948234558,
"step": 114
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6217948717948718,
"calib/avg_num_step_conf": 4.625,
"calib/ece": 0.1993307086614174,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.33858267716535434,
"calib/gap": 0.1435635792778649,
"calib/mean_conf": 0.6880708661417322,
"calib/mu_c": 0.7434615384615384,
"calib/mu_w": 0.5998979591836735,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1366141732283465,
"calib/std_conf": 0.29743493117836134,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38166666666666665,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.015538348082595854,
"calib/step_q_w": 0.3661283185840708,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2213.0,
"completions/max_terminated_length": 2213.0,
"completions/mean_length": 444.4609375,
"completions/mean_terminated_length": 444.4609375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.006345377303659916,
"kl": 0.08205413818359375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0121,
"num_tokens": 27918991.0,
"reward": 0.6729586720466614,
"reward_std": 0.22253727912902832,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7313871383666992,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.29343652725219727,
"step": 115
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6342528591177579,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.1986666666666666,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2980392156862745,
"calib/gap": 0.16562272213145668,
"calib/mean_conf": 0.6034509803921568,
"calib/mu_c": 0.6742465753424658,
"calib/mu_w": 0.5086238532110091,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11478431372549017,
"calib/std_conf": 0.3466295029321082,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.39010043041606884,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.0578584375335065,
"calib/step_q_w": 0.33224199288256234,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1177.0,
"completions/max_terminated_length": 1177.0,
"completions/mean_length": 508.07421875,
"completions/mean_terminated_length": 510.06671142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.005793208722025156,
"kl": 0.0783233642578125,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0129,
"num_tokens": 28153578.0,
"reward": 0.6464129686355591,
"reward_std": 0.22290663421154022,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7034027576446533,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2784857451915741,
"step": 116
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7134639303482587,
"calib/avg_num_step_conf": 5.171875,
"calib/ece": 0.16669291338582673,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2992125984251969,
"calib/gap": 0.22972388059701487,
"calib/mean_conf": 0.6566929133858267,
"calib/mu_c": 0.7652238805970148,
"calib/mu_w": 0.5355,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1479133858267716,
"calib/std_conf": 0.3043142675168991,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41185459940652813,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.058885368637297375,
"calib/step_q_w": 0.35296923076923076,
"calib/step_q_w_n": 650.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2356.0,
"completions/max_terminated_length": 2356.0,
"completions/mean_length": 490.5,
"completions/mean_terminated_length": 490.5,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.1248,
"grad_norm": 0.005411152727901936,
"kl": 0.0897064208984375,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0368,
"num_tokens": 28385746.0,
"reward": 0.6755538582801819,
"reward_std": 0.21486549079418182,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7500835657119751,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.29789912700653076,
"step": 117
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7546408137317229,
"calib/avg_num_step_conf": 5.94140625,
"calib/ece": 0.1397233201581028,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": 0.3244475524475524,
"calib/mean_conf": 0.6468379446640315,
"calib/mu_c": 0.7879020979020979,
"calib/mu_w": 0.46345454545454545,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1106719367588933,
"calib/std_conf": 0.3476224745399264,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35203726708074534,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.045550255907560955,
"calib/step_q_w": 0.3064870111731844,
"calib/step_q_w_n": 716.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2318.0,
"completions/max_terminated_length": 2318.0,
"completions/mean_length": 551.515625,
"completions/mean_terminated_length": 555.8582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.005002293735742569,
"kl": 0.08039093017578125,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0024,
"num_tokens": 28630942.0,
"reward": 0.6808527708053589,
"reward_std": 0.1966305822134018,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7770004272460938,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.27533018589019775,
"step": 118
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7247474747474748,
"calib/avg_num_step_conf": 4.94921875,
"calib/ece": 0.1542913385826772,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3031496062992126,
"calib/gap": 0.2739583333333333,
"calib/mean_conf": 0.5953149606299212,
"calib/mu_c": 0.7139583333333334,
"calib/mu_w": 0.44000000000000006,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09133858267716542,
"calib/std_conf": 0.3456850143501426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41453235294117646,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.046230251862244054,
"calib/step_q_w": 0.3683021010789324,
"calib/step_q_w_n": 587.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2349.0,
"completions/max_terminated_length": 2349.0,
"completions/mean_length": 551.5078125,
"completions/mean_terminated_length": 553.6705932617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.005192252341657877,
"kl": 0.08580780029296875,
"learning_rate": 2.25e-06,
"loss": 0.0301,
"num_tokens": 28877192.0,
"reward": 0.6779056191444397,
"reward_std": 0.20860622823238373,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7626948952674866,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2821788191795349,
"step": 119
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7398603723404256,
"calib/avg_num_step_conf": 5.15234375,
"calib/ece": 0.1735826771653544,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2559055118110236,
"calib/gap": 0.30309840425531914,
"calib/mean_conf": 0.5672047244094488,
"calib/mu_c": 0.679375,
"calib/mu_w": 0.3762765957446808,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05543307086614178,
"calib/std_conf": 0.35620879567946134,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41292326431181486,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.047943344633100016,
"calib/step_q_w": 0.36497991967871485,
"calib/step_q_w_n": 498.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2306.0,
"completions/max_terminated_length": 2306.0,
"completions/mean_length": 495.671875,
"completions/mean_terminated_length": 495.671875,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.128,
"grad_norm": 0.005819946061819792,
"kl": 0.0976715087890625,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0498,
"num_tokens": 29110772.0,
"reward": 0.709846019744873,
"reward_std": 0.21084654331207275,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7673988342285156,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.32963696122169495,
"step": 120
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6623596891575182,
"calib/avg_num_step_conf": 5.34375,
"calib/ece": 0.1974117647058823,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.19373072653262613,
"calib/mean_conf": 0.6141176470588235,
"calib/mu_c": 0.706044776119403,
"calib/mu_w": 0.5123140495867768,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1430196078431372,
"calib/std_conf": 0.33535568750143735,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4042550847457627,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": 0.007709630200308126,
"calib/step_q_w": 0.3965454545454546,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2152.0,
"completions/max_terminated_length": 2152.0,
"completions/mean_length": 549.84375,
"completions/mean_terminated_length": 552.0000610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.004886144772171974,
"kl": 0.09521484375,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0044,
"num_tokens": 29356588.0,
"reward": 0.6371598243713379,
"reward_std": 0.26278766989707947,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7241054773330688,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.24630790948867798,
"step": 121
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7403474903474905,
"calib/avg_num_step_conf": 4.78515625,
"calib/ece": 0.13984063745019926,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2868525896414343,
"calib/gap": 0.31731209781209774,
"calib/mean_conf": 0.6520318725099602,
"calib/mu_c": 0.7923571428571429,
"calib/mu_w": 0.47504504504504513,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1170517928286853,
"calib/std_conf": 0.33847833703431246,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43339848484848487,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.08638963529096272,
"calib/step_q_w": 0.34700884955752215,
"calib/step_q_w_n": 565.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2206.0,
"completions/max_terminated_length": 2206.0,
"completions/mean_length": 516.546875,
"completions/mean_terminated_length": 518.5725708007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.004893404431641102,
"kl": 0.10149383544921875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0062,
"num_tokens": 29596168.0,
"reward": 0.6806086301803589,
"reward_std": 0.2333202213048935,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7710624933242798,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2846860885620117,
"step": 122
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5823994935106047,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.2592857142857143,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2698412698412698,
"calib/gap": 0.10882051282051297,
"calib/mean_conf": 0.6021428571428572,
"calib/mu_c": 0.6526666666666667,
"calib/mu_w": 0.5438461538461538,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16285714285714287,
"calib/std_conf": 0.34300864708165757,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44494976377952755,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.07440345910381113,
"calib/step_q_w": 0.3705463046757164,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 577.77734375,
"completions/mean_terminated_length": 580.0431518554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.1312,
"grad_norm": 0.004810995887964964,
"kl": 0.0954742431640625,
"learning_rate": 2.138888888888889e-06,
"loss": 0.0226,
"num_tokens": 29849367.0,
"reward": 0.5776532888412476,
"reward_std": 0.2678280472755432,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6648507714271545,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.1896745264530182,
"step": 123
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6507565789473684,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.19662745098039225,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2784313725490196,
"calib/gap": 0.1785657894736843,
"calib/mean_conf": 0.6257254901960785,
"calib/mu_c": 0.69225,
"calib/mu_w": 0.5136842105263157,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09745098039215695,
"calib/std_conf": 0.33533384468237853,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44340931677018636,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.010309316770186372,
"calib/step_q_w": 0.4331,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1759.0,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 510.94140625,
"completions/mean_terminated_length": 512.9451293945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.004809759557247162,
"kl": 0.1109619140625,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0343,
"num_tokens": 30086984.0,
"reward": 0.6774014234542847,
"reward_std": 0.2310647964477539,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7343937158584595,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.29619038105010986,
"step": 124
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5847081999507511,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.2771372549019608,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2784313725490196,
"calib/gap": 0.10810268406796364,
"calib/mean_conf": 0.6024705882352941,
"calib/mu_c": 0.6550381679389313,
"calib/mu_w": 0.5469354838709677,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18294117647058825,
"calib/std_conf": 0.3478254212169548,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42561712439418414,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.05229015646415497,
"calib/step_q_w": 0.3733269679300292,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1760.0,
"completions/max_terminated_length": 1760.0,
"completions/mean_length": 538.71484375,
"completions/mean_terminated_length": 540.8274536132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.00505201518535614,
"kl": 0.10394287109375,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0273,
"num_tokens": 30329703.0,
"reward": 0.5742524862289429,
"reward_std": 0.24623748660087585,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6717531681060791,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.17597052454948425,
"step": 125
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6400856638951877,
"calib/avg_num_step_conf": 5.47265625,
"calib/ece": 0.2600210317460318,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.31746031746031744,
"calib/gap": 0.17258174603174614,
"calib/mean_conf": 0.6055345238095238,
"calib/mu_c": 0.6918253968253968,
"calib/mu_w": 0.5192436507936506,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18277777777777784,
"calib/std_conf": 0.3660238688636956,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4356067880794702,
"calib/step_q_c_n": 604.0,
"calib/step_q_gap": 0.09701318707570605,
"calib/step_q_w": 0.3385936010037641,
"calib/step_q_w_n": 797.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 528.65625,
"completions/mean_terminated_length": 532.8189086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.1344,
"grad_norm": 0.004750107880681753,
"kl": 0.0997772216796875,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0065,
"num_tokens": 30570503.0,
"reward": 0.5669196248054504,
"reward_std": 0.23805102705955505,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6774504780769348,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.16185757517814636,
"step": 126
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6682522681451614,
"calib/avg_num_step_conf": 5.37890625,
"calib/ece": 0.21242063492063484,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2777777777777778,
"calib/gap": 0.22329637096774185,
"calib/mean_conf": 0.6054365079365079,
"calib/mu_c": 0.7153125,
"calib/mu_w": 0.49201612903225816,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1549603174603174,
"calib/std_conf": 0.35511373085502307,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41818784194528874,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.04134959437922481,
"calib/step_q_w": 0.37683824756606393,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2279.0,
"completions/max_terminated_length": 2279.0,
"completions/mean_length": 498.65625,
"completions/mean_terminated_length": 502.5826721191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.006005220580846071,
"kl": 0.115814208984375,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0036,
"num_tokens": 30801831.0,
"reward": 0.5964082479476929,
"reward_std": 0.22729472815990448,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7108261585235596,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.1858965903520584,
"step": 127
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.636987619475271,
"calib/avg_num_step_conf": 4.7265625,
"calib/ece": 0.21941480000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.252,
"calib/gap": 0.18101319520174475,
"calib/mean_conf": 0.5682651999999999,
"calib/mu_c": 0.6544274809160305,
"calib/mu_w": 0.4734142857142858,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13184,
"calib/std_conf": 0.36506893216070857,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43656195462478187,
"calib/step_q_c_n": 573.0,
"calib/step_q_gap": 0.11015112259966414,
"calib/step_q_w": 0.3264108320251177,
"calib/step_q_w_n": 637.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2157.0,
"completions/max_terminated_length": 2157.0,
"completions/mean_length": 511.12890625,
"completions/mean_terminated_length": 519.2420654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.005148904863744974,
"kl": 0.119293212890625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0198,
"num_tokens": 31039344.0,
"reward": 0.571398138999939,
"reward_std": 0.23523159325122833,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.689100980758667,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.15603910386562347,
"step": 128
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5945349952061362,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.27204724409448816,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3031496062992126,
"calib/gap": 0.0967350591243209,
"calib/mean_conf": 0.6659842519685041,
"calib/mu_c": 0.7059731543624161,
"calib/mu_w": 0.6092380952380952,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17570866141732283,
"calib/std_conf": 0.32659001687448785,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43590402075226975,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": -0.014559651902420867,
"calib/step_q_w": 0.4504636726546906,
"calib/step_q_w_n": 501.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1996.0,
"completions/max_terminated_length": 1996.0,
"completions/mean_length": 461.68359375,
"completions/mean_terminated_length": 461.68359375,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.1376,
"grad_norm": 0.005383655428886414,
"kl": 0.1225128173828125,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0319,
"num_tokens": 31259919.0,
"reward": 0.6519710421562195,
"reward_std": 0.24818632006645203,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6860554814338684,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.30304285883903503,
"step": 129
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7220584381551363,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.13090196078431376,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3058823529411765,
"calib/gap": 0.26196737421383637,
"calib/mean_conf": 0.6494901960784314,
"calib/mu_c": 0.7481132075471697,
"calib/mu_w": 0.48614583333333333,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07843137254901965,
"calib/std_conf": 0.33223446555100444,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46137522768670314,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.050129702394874354,
"calib/step_q_w": 0.4112455252918288,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2370.0,
"completions/max_terminated_length": 2370.0,
"completions/mean_length": 479.08984375,
"completions/mean_terminated_length": 479.08984375,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.005056967493146658,
"kl": 0.123077392578125,
"learning_rate": 1.944444444444445e-06,
"loss": 0.002,
"num_tokens": 31487854.0,
"reward": 0.6769087314605713,
"reward_std": 0.18351736664772034,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7702523469924927,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2609088718891144,
"step": 130
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6623878364905285,
"calib/avg_num_step_conf": 4.953125,
"calib/ece": 0.22767716535433072,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.1968503937007874,
"calib/gap": 0.19071660019940173,
"calib/mean_conf": 0.5394094488188976,
"calib/mu_c": 0.6415254237288135,
"calib/mu_w": 0.45080882352941176,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15125984251968505,
"calib/std_conf": 0.34153204745423066,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4148675721561969,
"calib/step_q_c_n": 589.0,
"calib/step_q_gap": 0.016053117878910728,
"calib/step_q_w": 0.3988144542772862,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2269.0,
"completions/max_terminated_length": 2269.0,
"completions/mean_length": 481.52734375,
"completions/mean_terminated_length": 481.52734375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.005481308791786432,
"kl": 0.1237945556640625,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0127,
"num_tokens": 31717333.0,
"reward": 0.6147300601005554,
"reward_std": 0.19411855936050415,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7143527269363403,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.22526362538337708,
"step": 131
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7591631423811354,
"calib/avg_num_step_conf": 4.703125,
"calib/ece": 0.14913725490196073,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.28627450980392155,
"calib/gap": 0.32385620915032676,
"calib/mean_conf": 0.6092156862745098,
"calib/mu_c": 0.7387581699346405,
"calib/mu_w": 0.41490196078431374,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07917647058823526,
"calib/std_conf": 0.3581326467876354,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.447086889818689,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.11053658181047543,
"calib/step_q_w": 0.33655030800821356,
"calib/step_q_w_n": 487.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1378.0,
"completions/max_terminated_length": 1378.0,
"completions/mean_length": 499.546875,
"completions/mean_terminated_length": 501.50592041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1408,
"grad_norm": 0.005307801067829132,
"kl": 0.12115478515625,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0038,
"num_tokens": 31950809.0,
"reward": 0.676338791847229,
"reward_std": 0.23184099793434143,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.783726155757904,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.25098270177841187,
"step": 132
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6145109395109395,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.28905138339920944,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2648221343873518,
"calib/gap": 0.15012612612612608,
"calib/mean_conf": 0.5928458498023715,
"calib/mu_c": 0.6806666666666666,
"calib/mu_w": 0.5305405405405406,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23343873517786556,
"calib/std_conf": 0.3546771119033089,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42118825910931174,
"calib/step_q_c_n": 494.0,
"calib/step_q_gap": 0.05738959244264508,
"calib/step_q_w": 0.36379866666666666,
"calib/step_q_w_n": 750.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1809.0,
"completions/max_terminated_length": 1809.0,
"completions/mean_length": 552.0078125,
"completions/mean_terminated_length": 556.3543090820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.0047843498177826405,
"kl": 0.1084136962890625,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0054,
"num_tokens": 32198467.0,
"reward": 0.5920966267585754,
"reward_std": 0.2622288465499878,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6648152470588684,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.23969054222106934,
"step": 133
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.774765478424015,
"calib/avg_num_step_conf": 4.70703125,
"calib/ece": 0.12474308300395254,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.25691699604743085,
"calib/gap": 0.3447091932457787,
"calib/mean_conf": 0.5839525691699605,
"calib/mu_c": 0.7515384615384616,
"calib/mu_w": 0.4068292682926829,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09743083003952566,
"calib/std_conf": 0.35480578348636826,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.410953947368421,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.057947582209292026,
"calib/step_q_w": 0.353006365159129,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1944.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 544.9453125,
"completions/mean_terminated_length": 549.2362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.004672987386584282,
"kl": 0.1106719970703125,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0134,
"num_tokens": 32446925.0,
"reward": 0.6539692282676697,
"reward_std": 0.23669315874576569,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7823336124420166,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.22638607025146484,
"step": 134
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6206156716417911,
"calib/avg_num_step_conf": 4.6328125,
"calib/ece": 0.24350393700787404,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2952755905511811,
"calib/gap": 0.14422388059701496,
"calib/mean_conf": 0.6179133858267716,
"calib/mu_c": 0.6940000000000001,
"calib/mu_w": 0.5497761194029851,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19448818897637796,
"calib/std_conf": 0.33028974451628174,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42056243781094527,
"calib/step_q_c_n": 536.0,
"calib/step_q_gap": 0.04271284806735548,
"calib/step_q_w": 0.3778495897435898,
"calib/step_q_w_n": 650.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 548.71484375,
"completions/mean_terminated_length": 548.71484375,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.144,
"grad_norm": 0.004846002906560898,
"kl": 0.110809326171875,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0157,
"num_tokens": 32693276.0,
"reward": 0.6306657195091248,
"reward_std": 0.2516302466392517,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.686989426612854,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2829357087612152,
"step": 135
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7482587064676616,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.16629921259842523,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2637795275590551,
"calib/gap": 0.3096194029850746,
"calib/mean_conf": 0.5501574803149607,
"calib/mu_c": 0.7135,
"calib/mu_w": 0.4038805970149254,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1220078740157481,
"calib/std_conf": 0.3632345595874525,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.39179575163398694,
"calib/step_q_c_n": 612.0,
"calib/step_q_gap": 0.036409746544928456,
"calib/step_q_w": 0.3553860050890585,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2233.0,
"completions/max_terminated_length": 2233.0,
"completions/mean_length": 495.49609375,
"completions/mean_terminated_length": 495.49609375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.005031277425587177,
"kl": 0.1255035400390625,
"learning_rate": 1.777777777777778e-06,
"loss": -0.007,
"num_tokens": 32928611.0,
"reward": 0.658591628074646,
"reward_std": 0.2328774631023407,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7611265778541565,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2638690769672394,
"step": 136
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6497611865258924,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.22667984189723317,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.30039525691699603,
"calib/gap": 0.17296694318753136,
"calib/mean_conf": 0.6133201581027667,
"calib/mu_c": 0.6933088235294117,
"calib/mu_w": 0.5203418803418803,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15122529644268773,
"calib/std_conf": 0.3469324128412001,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.385613630406291,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": 0.010008560476221118,
"calib/step_q_w": 0.37560506993006987,
"calib/step_q_w_n": 572.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2087.0,
"completions/max_terminated_length": 2087.0,
"completions/mean_length": 489.1171875,
"completions/mean_terminated_length": 491.0353088378906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.004626833368092775,
"kl": 0.1271209716796875,
"learning_rate": 1.75e-06,
"loss": 0.0013,
"num_tokens": 33160809.0,
"reward": 0.6661921739578247,
"reward_std": 0.2168254852294922,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7029668092727661,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.32551121711730957,
"step": 137
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7155593093093093,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.15832031250000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.2578125,
"calib/gap": 0.23970220220220223,
"calib/mean_conf": 0.5904296874999999,
"calib/mu_c": 0.691554054054054,
"calib/mu_w": 0.4518518518518518,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08531250000000007,
"calib/std_conf": 0.34287988019662563,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4437465181058496,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.07487927672653927,
"calib/step_q_w": 0.3688672413793103,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1278.0,
"completions/max_terminated_length": 1278.0,
"completions/mean_length": 502.1015625,
"completions/mean_terminated_length": 504.07061767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1472,
"grad_norm": 0.005026695318520069,
"kl": 0.1251373291015625,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0011,
"num_tokens": 33393683.0,
"reward": 0.6768547296524048,
"reward_std": 0.22881154716014862,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7553105354309082,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.2827739715576172,
"step": 138
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7254982600442897,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.16476377952755902,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2795275590551181,
"calib/gap": 0.27408984498576405,
"calib/mean_conf": 0.6055511811023622,
"calib/mu_c": 0.7231724137931036,
"calib/mu_w": 0.44908256880733954,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09972440944881886,
"calib/std_conf": 0.35066569333605613,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4365808823529412,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.027864926879842877,
"calib/step_q_w": 0.4087159554730983,
"calib/step_q_w_n": 539.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2811.0,
"completions/max_terminated_length": 2811.0,
"completions/mean_length": 458.19921875,
"completions/mean_terminated_length": 459.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.005129787139594555,
"kl": 0.13323974609375,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0286,
"num_tokens": 33614078.0,
"reward": 0.6852666139602661,
"reward_std": 0.2394963949918747,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7552605271339417,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.30433520674705505,
"step": 139
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8103160842891437,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.10023529411764698,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.3137254901960784,
"calib/gap": 0.3649046412376634,
"calib/mean_conf": 0.6422745098039215,
"calib/mu_c": 0.7739263803680982,
"calib/mu_w": 0.4090217391304348,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.051647058823529324,
"calib/std_conf": 0.3423327173237316,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4144586070959264,
"calib/step_q_c_n": 761.0,
"calib/step_q_gap": -0.002594204640014919,
"calib/step_q_w": 0.41705281173594133,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 488.71484375,
"completions/mean_terminated_length": 488.71484375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.005066412966698408,
"kl": 0.129180908203125,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0392,
"num_tokens": 33844205.0,
"reward": 0.7103152275085449,
"reward_std": 0.2360602468252182,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.8172827959060669,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.27678507566452026,
"step": 140
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7928881123406459,
"calib/avg_num_step_conf": 4.64453125,
"calib/ece": 0.10885826771653549,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2874015748031496,
"calib/gap": 0.36308613214262603,
"calib/mean_conf": 0.6105905511811023,
"calib/mu_c": 0.754967320261438,
"calib/mu_w": 0.39188118811881195,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05854330708661423,
"calib/std_conf": 0.3468622648499856,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42326777456647396,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.048438800723415554,
"calib/step_q_w": 0.3748289738430584,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2038.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 529.62109375,
"completions/mean_terminated_length": 529.62109375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.1504,
"grad_norm": 0.004820066969841719,
"kl": 0.1171722412109375,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0115,
"num_tokens": 34086884.0,
"reward": 0.6914817094802856,
"reward_std": 0.22751720249652863,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.8076714873313904,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.25732317566871643,
"step": 141
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7499068554396424,
"calib/avg_num_step_conf": 4.9609375,
"calib/ece": 0.14440944881889772,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2440944881889764,
"calib/gap": 0.3235059612518629,
"calib/mean_conf": 0.5598425196850393,
"calib/mu_c": 0.7152272727272727,
"calib/mu_w": 0.39172131147540984,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09228346456692924,
"calib/std_conf": 0.3597078522627466,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4225084745762712,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.05316870001910534,
"calib/step_q_w": 0.3693397745571659,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1377.0,
"completions/max_terminated_length": 1377.0,
"completions/mean_length": 509.4375,
"completions/mean_terminated_length": 511.4353332519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.005076853092759848,
"kl": 0.125946044921875,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0051,
"num_tokens": 34322460.0,
"reward": 0.6526362895965576,
"reward_std": 0.21421653032302856,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7747867107391357,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.22892338037490845,
"step": 142
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7989877769289535,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.11095617529880475,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2589641434262948,
"calib/gap": 0.3715457091927679,
"calib/mean_conf": 0.5635458167330677,
"calib/mu_c": 0.7396969696969696,
"calib/mu_w": 0.3681512605042017,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07430278884462146,
"calib/std_conf": 0.35467028750808266,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.414310294117647,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.0897075570697487,
"calib/step_q_w": 0.3246027370478983,
"calib/step_q_w_n": 682.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2484.0,
"completions/max_terminated_length": 2484.0,
"completions/mean_length": 520.80078125,
"completions/mean_terminated_length": 524.9015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.00469217961654067,
"kl": 0.1230316162109375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0088,
"num_tokens": 34563121.0,
"reward": 0.7117146253585815,
"reward_std": 0.20838120579719543,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7929409742355347,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.33126944303512573,
"step": 143
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6699095911949686,
"calib/avg_num_step_conf": 4.92578125,
"calib/ece": 0.20898039215686273,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.22745098039215686,
"calib/gap": 0.22264937106918237,
"calib/mean_conf": 0.5367450980392157,
"calib/mu_c": 0.6205660377358491,
"calib/mu_w": 0.3979166666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.061098039215686274,
"calib/std_conf": 0.35905430487395606,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.38659533742331287,
"calib/step_q_c_n": 815.0,
"calib/step_q_gap": -0.031072824011664724,
"calib/step_q_w": 0.4176681614349776,
"calib/step_q_w_n": 446.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2631.0,
"completions/max_terminated_length": 2631.0,
"completions/mean_length": 509.140625,
"completions/mean_terminated_length": 509.140625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.1536,
"grad_norm": 0.004818373825401068,
"kl": 0.13165283203125,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0273,
"num_tokens": 34797589.0,
"reward": 0.672430157661438,
"reward_std": 0.2170725166797638,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7267078161239624,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.29549625515937805,
"step": 144
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6520072019205122,
"calib/avg_num_step_conf": 5.19140625,
"calib/ece": 0.19156862745098044,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2980392156862745,
"calib/gap": 0.1672225926913844,
"calib/mean_conf": 0.622,
"calib/mu_c": 0.6823312883435583,
"calib/mu_w": 0.5151086956521739,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0871764705882353,
"calib/std_conf": 0.3363044838867792,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4111351351351351,
"calib/step_q_c_n": 851.0,
"calib/step_q_gap": 0.03382760375438204,
"calib/step_q_w": 0.37730753138075307,
"calib/step_q_w_n": 478.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1092.0,
"completions/max_terminated_length": 1092.0,
"completions/mean_length": 471.890625,
"completions/mean_terminated_length": 473.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.004987073130905628,
"kl": 0.125579833984375,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0031,
"num_tokens": 35021097.0,
"reward": 0.644719123840332,
"reward_std": 0.2383957952260971,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7302496433258057,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.23262612521648407,
"step": 145
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6666347381864624,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.21276679841897234,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2608695652173913,
"calib/gap": 0.23161430395913146,
"calib/mean_conf": 0.5546640316205534,
"calib/mu_c": 0.6874074074074074,
"calib/mu_w": 0.4557931034482759,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17027667984189726,
"calib/std_conf": 0.3571411322026893,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3992572463768116,
"calib/step_q_c_n": 552.0,
"calib/step_q_gap": 0.027237056674914606,
"calib/step_q_w": 0.372020189701897,
"calib/step_q_w_n": 738.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2317.0,
"completions/max_terminated_length": 2317.0,
"completions/mean_length": 523.33984375,
"completions/mean_terminated_length": 523.33984375,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.005135540850460529,
"kl": 0.1239166259765625,
"learning_rate": 1.5e-06,
"loss": -0.0058,
"num_tokens": 35262288.0,
"reward": 0.620847225189209,
"reward_std": 0.20095868408679962,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.7163043022155762,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2433588057756424,
"step": 146
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6535714285714286,
"calib/avg_num_step_conf": 5.19921875,
"calib/ece": 0.22660079051383403,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2924901185770751,
"calib/gap": 0.1993408521303257,
"calib/mean_conf": 0.6015415019762845,
"calib/mu_c": 0.7063333333333333,
"calib/mu_w": 0.5069924812030076,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17691699604743083,
"calib/std_conf": 0.35387018239051427,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43867449664429536,
"calib/step_q_c_n": 596.0,
"calib/step_q_gap": 0.08102551705245858,
"calib/step_q_w": 0.3576489795918368,
"calib/step_q_w_n": 735.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1207.0,
"completions/max_terminated_length": 1207.0,
"completions/mean_length": 514.41015625,
"completions/mean_terminated_length": 516.427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1568,
"grad_norm": 0.005366252735257149,
"kl": 0.1230010986328125,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0171,
"num_tokens": 35497657.0,
"reward": 0.6017385721206665,
"reward_std": 0.2290113866329193,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7003504037857056,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2101580798625946,
"step": 147
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7547921071176885,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.14605577689243032,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": 0.32399859055673014,
"calib/mean_conf": 0.6243824701195219,
"calib/mu_c": 0.7353939393939394,
"calib/mu_w": 0.41139534883720924,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05653386454183271,
"calib/std_conf": 0.34655492474931315,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4162940024479804,
"calib/step_q_c_n": 817.0,
"calib/step_q_gap": 0.0164826816932635,
"calib/step_q_w": 0.3998113207547169,
"calib/step_q_w_n": 477.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1845.0,
"completions/max_terminated_length": 1845.0,
"completions/mean_length": 483.390625,
"completions/mean_terminated_length": 489.12255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.005183129571378231,
"kl": 0.1303863525390625,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0061,
"num_tokens": 35726517.0,
"reward": 0.7119890451431274,
"reward_std": 0.21605724096298218,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7839125394821167,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.3150656521320343,
"step": 148
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7620216836734693,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.12071428571428566,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.30952380952380953,
"calib/gap": 0.34632142857142867,
"calib/mean_conf": 0.5707936507936507,
"calib/mu_c": 0.7247142857142858,
"calib/mu_w": 0.3783928571428571,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06797619047619043,
"calib/std_conf": 0.3578653714550651,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4296190839694657,
"calib/step_q_c_n": 655.0,
"calib/step_q_gap": 0.10079003707082723,
"calib/step_q_w": 0.32882904689863846,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 543.54296875,
"completions/mean_terminated_length": 545.674560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.0045463708229362965,
"kl": 0.1185455322265625,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0153,
"num_tokens": 35970120.0,
"reward": 0.6921254396438599,
"reward_std": 0.22323009371757507,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7795078158378601,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.29927438497543335,
"step": 149
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7414935064935065,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.17824000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.372,
"calib/gap": 0.29605844155844135,
"calib/mean_conf": 0.64152,
"calib/mu_c": 0.7717857142857142,
"calib/mu_w": 0.47572727272727283,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12988000000000002,
"calib/std_conf": 0.3545544945420943,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4243561643835616,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.0568515666824122,
"calib/step_q_w": 0.3675045977011494,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 484.84765625,
"completions/mean_terminated_length": 484.84765625,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.16,
"grad_norm": 0.004933979362249374,
"kl": 0.1362152099609375,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0212,
"num_tokens": 36199201.0,
"reward": 0.6484754085540771,
"reward_std": 0.2582949995994568,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7452633380889893,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.2470000684261322,
"step": 150
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7442561205273069,
"calib/avg_num_step_conf": 4.78125,
"calib/ece": 0.15478260869565222,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.25296442687747034,
"calib/gap": 0.3273986189579412,
"calib/mean_conf": 0.5369960474308301,
"calib/mu_c": 0.7116949152542374,
"calib/mu_w": 0.38429629629629625,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11268774703557316,
"calib/std_conf": 0.3602179675892091,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40124333925399647,
"calib/step_q_c_n": 563.0,
"calib/step_q_gap": 0.04791353592570602,
"calib/step_q_w": 0.35332980332829045,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2731.0,
"completions/max_terminated_length": 2731.0,
"completions/mean_length": 530.87890625,
"completions/mean_terminated_length": 532.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.005048654042184353,
"kl": 0.119140625,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0143,
"num_tokens": 36442130.0,
"reward": 0.6133000254631042,
"reward_std": 0.21529051661491394,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.770215630531311,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.1665406972169876,
"step": 151
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6082205029013539,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.22011952191235057,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.17131474103585656,
"calib/gap": 0.12902256608639584,
"calib/mean_conf": 0.5024302788844621,
"calib/mu_c": 0.5749090909090909,
"calib/mu_w": 0.4458865248226951,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1421513944223107,
"calib/std_conf": 0.3419373997941873,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4019067415730337,
"calib/step_q_c_n": 534.0,
"calib/step_q_gap": 0.04298010064639274,
"calib/step_q_w": 0.35892664092664095,
"calib/step_q_w_n": 777.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 512.16015625,
"completions/mean_terminated_length": 514.1686401367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.005053962115198374,
"kl": 0.125335693359375,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.005,
"num_tokens": 36678635.0,
"reward": 0.5785558223724365,
"reward_std": 0.2497299313545227,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.6750144362449646,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.20162850618362427,
"step": 152
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5855084636257915,
"calib/avg_num_step_conf": 5.1484375,
"calib/ece": 0.2675697211155379,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2350597609561753,
"calib/gap": 0.10179286729551623,
"calib/mean_conf": 0.5389641434262948,
"calib/mu_c": 0.583169014084507,
"calib/mu_w": 0.4813761467889908,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12039840637450201,
"calib/std_conf": 0.3531132493149983,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3921118045476536,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.06117746432507809,
"calib/step_q_w": 0.3309343402225755,
"calib/step_q_w_n": 629.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 557.41015625,
"completions/mean_terminated_length": 557.41015625,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.1632,
"grad_norm": 0.004702843725681305,
"kl": 0.1187286376953125,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0577,
"num_tokens": 36928652.0,
"reward": 0.5983327627182007,
"reward_std": 0.22896221280097961,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6617656350135803,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2286498248577118,
"step": 153
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6769323671497585,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.23519841269841268,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.24603174603174602,
"calib/gap": 0.23345156369183856,
"calib/mean_conf": 0.5424206349206349,
"calib/mu_c": 0.6702631578947371,
"calib/mu_w": 0.43681159420289856,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1626190476190476,
"calib/std_conf": 0.3671210572621941,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41268858800773695,
"calib/step_q_c_n": 517.0,
"calib/step_q_gap": 0.04542887577752114,
"calib/step_q_w": 0.3672597122302158,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 513.32421875,
"completions/mean_terminated_length": 513.32421875,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.16426666666666667,
"grad_norm": 42.83332824707031,
"kl": 424.1183166503906,
"learning_rate": 1.2777777777777779e-06,
"loss": 4.2855,
"num_tokens": 37164503.0,
"reward": 0.5859673023223877,
"reward_std": 0.23850896954536438,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7137206792831421,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.17227637767791748,
"step": 154
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6353689100800782,
"calib/avg_num_step_conf": 5.35546875,
"calib/ece": 0.261171875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.2578125,
"calib/gap": 0.17725900116144022,
"calib/mean_conf": 0.529453125,
"calib/mu_c": 0.6215447154471545,
"calib/mu_w": 0.4442857142857143,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.155078125,
"calib/std_conf": 0.3705766380625395,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3797499267935578,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": -0.014480063516519703,
"calib/step_q_w": 0.3942299903100775,
"calib/step_q_w_n": 688.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1181.0,
"completions/max_terminated_length": 1181.0,
"completions/mean_length": 469.2890625,
"completions/mean_terminated_length": 471.1294250488281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.005444099195301533,
"kl": 0.132110595703125,
"learning_rate": 1.25e-06,
"loss": 0.0071,
"num_tokens": 37391857.0,
"reward": 0.5964875817298889,
"reward_std": 0.23414337635040283,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6991492509841919,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.19773223996162415,
"step": 155
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7470858134920634,
"calib/avg_num_step_conf": 5.390625,
"calib/ece": 0.14956692913385827,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.23228346456692914,
"calib/gap": 0.3048710317460318,
"calib/mean_conf": 0.5268897637795276,
"calib/mu_c": 0.6781250000000001,
"calib/mu_w": 0.3732539682539683,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08625984251968505,
"calib/std_conf": 0.3619340160063211,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40273932253313693,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.05212591311801562,
"calib/step_q_w": 0.3506134094151213,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2337.0,
"completions/max_terminated_length": 2337.0,
"completions/mean_length": 524.4609375,
"completions/mean_terminated_length": 526.5177001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.1664,
"grad_norm": 0.004729734733700752,
"kl": 0.13568115234375,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0215,
"num_tokens": 37630879.0,
"reward": 0.6934912204742432,
"reward_std": 0.21397851407527924,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7648956775665283,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.32364919781684875,
"step": 156
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7214698963697628,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.16271653543307094,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.25984251968503935,
"calib/gap": 0.2603566660308984,
"calib/mean_conf": 0.6001181102362205,
"calib/mu_c": 0.709795918367347,
"calib/mu_w": 0.4494392523364486,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09204724409448822,
"calib/std_conf": 0.35050299321788464,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4152710872162485,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": 0.04565204776513354,
"calib/step_q_w": 0.36961903945111496,
"calib/step_q_w_n": 583.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 2512.0,
"completions/mean_length": 512.2890625,
"completions/mean_terminated_length": 514.298095703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.004471070598810911,
"kl": 0.134033203125,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0115,
"num_tokens": 37865753.0,
"reward": 0.6392801403999329,
"reward_std": 0.2263888418674469,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7539042830467224,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.21137472987174988,
"step": 157
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6468207718207718,
"calib/avg_num_step_conf": 5.31640625,
"calib/ece": 0.2062745098039216,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.30980392156862746,
"calib/gap": 0.17891414141414141,
"calib/mean_conf": 0.5980392156862745,
"calib/mu_c": 0.6675,
"calib/mu_w": 0.48858585858585857,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09627450980392163,
"calib/std_conf": 0.34570201158667146,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41464428457234215,
"calib/step_q_c_n": 834.0,
"calib/step_q_gap": 0.034258452820286556,
"calib/step_q_w": 0.3803858317520556,
"calib/step_q_w_n": 527.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1748.0,
"completions/max_terminated_length": 1748.0,
"completions/mean_length": 492.14453125,
"completions/mean_terminated_length": 494.07452392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.004758656956255436,
"kl": 0.15380859375,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0176,
"num_tokens": 38096982.0,
"reward": 0.6552407741546631,
"reward_std": 0.21743571758270264,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.721720278263092,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.26844868063926697,
"step": 158
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6775748351090817,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.20212598425196854,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2677165354330709,
"calib/gap": 0.21219558599695593,
"calib/mean_conf": 0.5766929133858267,
"calib/mu_c": 0.6669178082191781,
"calib/mu_w": 0.4547222222222222,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10200787401574807,
"calib/std_conf": 0.3529862595039625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4086933842239186,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.04906249737376567,
"calib/step_q_w": 0.35963088685015293,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2944.0,
"completions/max_terminated_length": 2944.0,
"completions/mean_length": 503.87890625,
"completions/mean_terminated_length": 503.87890625,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.1696,
"grad_norm": 0.004795048851519823,
"kl": 0.15203857421875,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0394,
"num_tokens": 38330759.0,
"reward": 0.6931055784225464,
"reward_std": 0.20757821202278137,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.728975772857666,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.34473535418510437,
"step": 159
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7036967418546366,
"calib/avg_num_step_conf": 5.484375,
"calib/ece": 0.20494071146245058,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.308300395256917,
"calib/gap": 0.2683734335839598,
"calib/mean_conf": 0.5654150197628458,
"calib/mu_c": 0.6927067669172932,
"calib/mu_w": 0.42433333333333334,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12233201581027668,
"calib/std_conf": 0.3729959966666498,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3919554131054131,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.0031860398860398265,
"calib/step_q_w": 0.38876937321937327,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 527.1875,
"completions/mean_terminated_length": 529.2549438476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.004639564547687769,
"kl": 0.1502838134765625,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0288,
"num_tokens": 38570559.0,
"reward": 0.6068136692047119,
"reward_std": 0.2425975650548935,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7312051057815552,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.1808597892522812,
"step": 160
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6673586829836828,
"calib/avg_num_step_conf": 5.53125,
"calib/ece": 0.20228346456692903,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.30708661417322836,
"calib/gap": 0.20857080419580404,
"calib/mean_conf": 0.6125984251968505,
"calib/mu_c": 0.6766477272727273,
"calib/mu_w": 0.4680769230769232,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06098425196850384,
"calib/std_conf": 0.35160198641064827,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3988762641898865,
"calib/step_q_c_n": 969.0,
"calib/step_q_gap": -0.03246601768930818,
"calib/step_q_w": 0.43134228187919466,
"calib/step_q_w_n": 447.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 2512.0,
"completions/mean_length": 488.07421875,
"completions/mean_terminated_length": 488.07421875,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.004807854071259499,
"kl": 0.146820068359375,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0451,
"num_tokens": 38799426.0,
"reward": 0.678764283657074,
"reward_std": 0.22198915481567383,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7400749921798706,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.28151601552963257,
"step": 161
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6792058516196448,
"calib/avg_num_step_conf": 5.265625,
"calib/ece": 0.18742063492063482,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.31746031746031744,
"calib/gap": 0.22275653082549635,
"calib/mean_conf": 0.6286111111111111,
"calib/mu_c": 0.7055151515151515,
"calib/mu_w": 0.4827586206896552,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08063492063492056,
"calib/std_conf": 0.35325590958159636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42484174311926604,
"calib/step_q_c_n": 872.0,
"calib/step_q_gap": 0.03172451622850969,
"calib/step_q_w": 0.39311722689075634,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 483.18359375,
"completions/mean_terminated_length": 483.18359375,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.1728,
"grad_norm": 0.005033192690461874,
"kl": 0.164642333984375,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0649,
"num_tokens": 39027265.0,
"reward": 0.6548143625259399,
"reward_std": 0.2401808500289917,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7367371320724487,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.24711044132709503,
"step": 162
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6913576327152655,
"calib/avg_num_step_conf": 5.87890625,
"calib/ece": 0.2013147410358566,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.24701195219123506,
"calib/gap": 0.2577959105918212,
"calib/mean_conf": 0.5337450199203189,
"calib/mu_c": 0.6611023622047245,
"calib/mu_w": 0.4033064516129033,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11454183266932269,
"calib/std_conf": 0.367372926678158,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4159095759233926,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.035744201246390106,
"calib/step_q_w": 0.3801653746770025,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2913.0,
"completions/max_terminated_length": 2913.0,
"completions/mean_length": 557.71484375,
"completions/mean_terminated_length": 557.71484375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.004345105495303869,
"kl": 0.148162841796875,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0341,
"num_tokens": 39274872.0,
"reward": 0.605945885181427,
"reward_std": 0.23104888200759888,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7246820330619812,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.19267842173576355,
"step": 163
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7164532206169958,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.18797619047619046,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.23015873015873015,
"calib/gap": 0.2715998990599963,
"calib/mean_conf": 0.5062301587301588,
"calib/mu_c": 0.6366412213740459,
"calib/mu_w": 0.36504132231404957,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08718253968253968,
"calib/std_conf": 0.3661203649486512,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3894722955145119,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.06453815775684474,
"calib/step_q_w": 0.32493413775766716,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1733.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 563.26953125,
"completions/mean_terminated_length": 565.4784545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.004327528644353151,
"kl": 0.1538543701171875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.014,
"num_tokens": 39525205.0,
"reward": 0.6413910388946533,
"reward_std": 0.22969061136245728,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.740004301071167,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2435591071844101,
"step": 164
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6714582823586982,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.21378906250000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.23828125,
"calib/gap": 0.2234732077318327,
"calib/mean_conf": 0.5459765625,
"calib/mu_c": 0.6629508196721312,
"calib/mu_w": 0.4394776119402985,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14160156250000003,
"calib/std_conf": 0.36246204466907095,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41703323485967503,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.018956477636734137,
"calib/step_q_w": 0.3980767572229409,
"calib/step_q_w_n": 773.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 540.61328125,
"completions/mean_terminated_length": 542.7333374023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.176,
"grad_norm": 0.0047814385034143925,
"kl": 0.159088134765625,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0187,
"num_tokens": 39769178.0,
"reward": 0.6197627782821655,
"reward_std": 0.2257610261440277,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7219374775886536,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.22305673360824585,
"step": 165
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7330788477577468,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.15984087301587302,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.23015873015873015,
"calib/gap": 0.3154504202219798,
"calib/mean_conf": 0.5316670634920636,
"calib/mu_c": 0.668111888111888,
"calib/mu_w": 0.35266146788990826,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.06202380952380954,
"calib/std_conf": 0.3670971265265847,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.40082370560547714,
"calib/step_q_c_n": 779.0,
"calib/step_q_gap": 0.04471630977214386,
"calib/step_q_w": 0.3561073958333333,
"calib/step_q_w_n": 640.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2072.0,
"completions/max_terminated_length": 2072.0,
"completions/mean_length": 541.83203125,
"completions/mean_terminated_length": 543.9569091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.004498959984630346,
"kl": 0.1609344482421875,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0236,
"num_tokens": 40014071.0,
"reward": 0.664745569229126,
"reward_std": 0.26040273904800415,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7511249780654907,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.2728974223136902,
"step": 166
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.559514687100894,
"calib/avg_num_step_conf": 5.59765625,
"calib/ece": 0.3013043478260869,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2845849802371542,
"calib/gap": 0.07738761174968056,
"calib/mean_conf": 0.5564822134387353,
"calib/mu_c": 0.5895172413793103,
"calib/mu_w": 0.5121296296296297,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14233201581027666,
"calib/std_conf": 0.3699501577900132,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3994137931034483,
"calib/step_q_c_n": 841.0,
"calib/step_q_gap": -0.020249551491146267,
"calib/step_q_w": 0.4196633445945946,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1781.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 538.44921875,
"completions/mean_terminated_length": 538.44921875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.0044858017936348915,
"kl": 0.15997314453125,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0062,
"num_tokens": 40257522.0,
"reward": 0.589515209197998,
"reward_std": 0.2210143506526947,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6483847498893738,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.21970805525779724,
"step": 167
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7101552795031056,
"calib/avg_num_step_conf": 5.453125,
"calib/ece": 0.19098039215686277,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.23137254901960785,
"calib/gap": 0.2726459627329192,
"calib/mean_conf": 0.5204705882352941,
"calib/mu_c": 0.6434285714285713,
"calib/mu_w": 0.37078260869565216,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08121568627450981,
"calib/std_conf": 0.37339220577166116,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4247226519337016,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.061910151933701585,
"calib/step_q_w": 0.36281250000000004,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1329.0,
"completions/max_terminated_length": 1329.0,
"completions/mean_length": 532.8984375,
"completions/mean_terminated_length": 534.98828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1792,
"grad_norm": 0.004624354187399149,
"kl": 0.161590576171875,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0188,
"num_tokens": 40498616.0,
"reward": 0.6443181037902832,
"reward_std": 0.20038428902626038,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7442601919174194,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.23578235507011414,
"step": 168
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6509328358208957,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.18314960629921248,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.25196850393700787,
"calib/gap": 0.20415796019900495,
"calib/mean_conf": 0.5636220472440945,
"calib/mu_c": 0.6600746268656716,
"calib/mu_w": 0.4559166666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10960629921259832,
"calib/std_conf": 0.34923238499102205,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43017673469387757,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.055638900657309365,
"calib/step_q_w": 0.3745378340365682,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2881.0,
"completions/max_terminated_length": 2881.0,
"completions/mean_length": 529.83984375,
"completions/mean_terminated_length": 531.9176635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.005059287417680025,
"kl": 0.16729736328125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0123,
"num_tokens": 40738439.0,
"reward": 0.6686310768127441,
"reward_std": 0.24430686235427856,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.715927004814148,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.3197726011276245,
"step": 169
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6846125186289121,
"calib/avg_num_step_conf": 5.71484375,
"calib/ece": 0.1486220472440945,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.28346456692913385,
"calib/gap": 0.2398559364133135,
"calib/mean_conf": 0.5755511811023621,
"calib/mu_c": 0.6907575757575758,
"calib/mu_w": 0.4509016393442623,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.102244094488189,
"calib/std_conf": 0.3417455932855625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4247980316645272,
"calib/step_q_c_n": 779.0,
"calib/step_q_gap": 0.0526141135358722,
"calib/step_q_w": 0.372183918128655,
"calib/step_q_w_n": 684.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 537.73046875,
"completions/mean_terminated_length": 537.73046875,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.004425270017236471,
"kl": 0.15838623046875,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0072,
"num_tokens": 40980250.0,
"reward": 0.6666654348373413,
"reward_std": 0.25412505865097046,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7407152652740479,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.29183441400527954,
"step": 170
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6623986735445836,
"calib/avg_num_step_conf": 5.76171875,
"calib/ece": 0.2222656249999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.23828125,
"calib/gap": 0.20489560304593468,
"calib/mean_conf": 0.526328125,
"calib/mu_c": 0.6367796610169492,
"calib/mu_w": 0.4318840579710145,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14382812499999997,
"calib/std_conf": 0.3605884320579133,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4277753472222222,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.010422296163691724,
"calib/step_q_w": 0.4173530510585305,
"calib/step_q_w_n": 803.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1227.0,
"completions/max_terminated_length": 1227.0,
"completions/mean_length": 507.859375,
"completions/mean_terminated_length": 509.85101318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1824,
"grad_norm": 0.004862784408032894,
"kl": 0.1643218994140625,
"learning_rate": 8.055555555555557e-07,
"loss": 0.0235,
"num_tokens": 41217158.0,
"reward": 0.6161177158355713,
"reward_std": 0.2292150855064392,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7190483808517456,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.22099950909614563,
"step": 171
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6468415937803693,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.1923809523809524,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.27380952380952384,
"calib/gap": 0.16925170068027195,
"calib/mean_conf": 0.6268253968253968,
"calib/mu_c": 0.6973469387755101,
"calib/mu_w": 0.5280952380952382,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11793650793650795,
"calib/std_conf": 0.32725566626268365,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4019522102747909,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": -0.008498770117365995,
"calib/step_q_w": 0.4104509803921569,
"calib/step_q_w_n": 578.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2931.0,
"completions/max_terminated_length": 2931.0,
"completions/mean_length": 513.92578125,
"completions/mean_terminated_length": 515.9412231445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.004686696920543909,
"kl": 0.169036865234375,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0545,
"num_tokens": 41452075.0,
"reward": 0.6795635223388672,
"reward_std": 0.25730636715888977,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.714409351348877,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.33456137776374817,
"step": 172
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6440162271805274,
"calib/avg_num_step_conf": 5.87890625,
"calib/ece": 0.20185119047619055,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": 0.20437905679513196,
"calib/mean_conf": 0.5958472222222223,
"calib/mu_c": 0.6899264705882354,
"calib/mu_w": 0.48554741379310346,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12900793650793657,
"calib/std_conf": 0.3517430502018481,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.426232951653944,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.08086007265533479,
"calib/step_q_w": 0.3453728789986092,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1909.0,
"completions/max_terminated_length": 1909.0,
"completions/mean_length": 561.88671875,
"completions/mean_terminated_length": 564.0902099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.004296320024877787,
"kl": 0.1624908447265625,
"learning_rate": 7.5e-07,
"loss": 0.0045,
"num_tokens": 41699078.0,
"reward": 0.5957044363021851,
"reward_std": 0.2674137055873871,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7136157155036926,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.1754494607448578,
"step": 173
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5652980132450331,
"calib/avg_num_step_conf": 6.140625,
"calib/ece": 0.2896573705179283,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.21912350597609562,
"calib/gap": 0.07488543046357615,
"calib/mean_conf": 0.48364940239043824,
"calib/mu_c": 0.5287,
"calib/mu_w": 0.4538145695364238,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18745019920318723,
"calib/std_conf": 0.36043551095779086,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4055314183123878,
"calib/step_q_c_n": 557.0,
"calib/step_q_gap": 0.02268074507757667,
"calib/step_q_w": 0.38285067323481115,
"calib/step_q_w_n": 1015.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2551.0,
"completions/max_terminated_length": 2551.0,
"completions/mean_length": 611.9375,
"completions/mean_terminated_length": 614.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1856,
"grad_norm": 0.004755265079438686,
"kl": 0.1500396728515625,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0174,
"num_tokens": 41959966.0,
"reward": 0.5307010412216187,
"reward_std": 0.25636354088783264,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.6387794613838196,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.1499662697315216,
"step": 174
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6603300330033004,
"calib/avg_num_step_conf": 6.05859375,
"calib/ece": 0.23573705179282872,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.1752988047808765,
"calib/gap": 0.20679207920792086,
"calib/mean_conf": 0.487211155378486,
"calib/mu_c": 0.6107920792079208,
"calib/mu_w": 0.4039999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16027888446215144,
"calib/std_conf": 0.35651057777686285,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4259963099630996,
"calib/step_q_c_n": 542.0,
"calib/step_q_gap": 0.0851025868048571,
"calib/step_q_w": 0.3408937231582425,
"calib/step_q_w_n": 1009.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2187.0,
"completions/max_terminated_length": 2187.0,
"completions/mean_length": 558.203125,
"completions/mean_terminated_length": 560.3922119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.004802222829312086,
"kl": 0.1537017822265625,
"learning_rate": 6.944444444444446e-07,
"loss": 0.0235,
"num_tokens": 42208690.0,
"reward": 0.5531376600265503,
"reward_std": 0.22892574965953827,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.7105348110198975,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.1207406222820282,
"step": 175
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6774114774114773,
"calib/avg_num_step_conf": 5.90234375,
"calib/ece": 0.21753906250000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.22265625,
"calib/gap": 0.22457020757020763,
"calib/mean_conf": 0.5219921875,
"calib/mu_c": 0.6360317460317461,
"calib/mu_w": 0.41146153846153843,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12367187499999999,
"calib/std_conf": 0.35720422633832993,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.447039908045977,
"calib/step_q_c_n": 725.0,
"calib/step_q_gap": 0.05842192670585833,
"calib/step_q_w": 0.38861798134011866,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1570.0,
"completions/max_terminated_length": 1570.0,
"completions/mean_length": 521.58984375,
"completions/mean_terminated_length": 523.6353149414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.004365100525319576,
"kl": 0.161163330078125,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0072,
"num_tokens": 42446281.0,
"reward": 0.6350507140159607,
"reward_std": 0.23502111434936523,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7338355183601379,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.23782828450202942,
"step": 176
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6791389680278569,
"calib/avg_num_step_conf": 5.734375,
"calib/ece": 0.20876984126984127,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": 0.23434188034188025,
"calib/mean_conf": 0.5348015873015873,
"calib/mu_c": 0.6603418803418802,
"calib/mu_w": 0.426,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13964285714285715,
"calib/std_conf": 0.3649031730634391,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4506144356955381,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.07512836126576622,
"calib/step_q_w": 0.3754860744297719,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 541.6171875,
"completions/mean_terminated_length": 543.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.1888,
"grad_norm": 0.004700418561697006,
"kl": 0.1542205810546875,
"learning_rate": 6.388888888888889e-07,
"loss": 0.0144,
"num_tokens": 42688767.0,
"reward": 0.6129065155982971,
"reward_std": 0.2510068416595459,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7183199524879456,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.21921183168888092,
"step": 177
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6999378689033862,
"calib/avg_num_step_conf": 5.7890625,
"calib/ece": 0.16547265625000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.21484375,
"calib/gap": 0.2520751164958063,
"calib/mean_conf": 0.53007421875,
"calib/mu_c": 0.6393724137931035,
"calib/mu_w": 0.3872972972972972,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06457031249999999,
"calib/std_conf": 0.35354642767651767,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4334288095238095,
"calib/step_q_c_n": 840.0,
"calib/step_q_gap": 0.07878654576472333,
"calib/step_q_w": 0.3546422637590862,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1346.0,
"completions/max_terminated_length": 1346.0,
"completions/mean_length": 501.609375,
"completions/mean_terminated_length": 503.5765075683594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.004584399983286858,
"kl": 0.1641693115234375,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0016,
"num_tokens": 42923251.0,
"reward": 0.6758027076721191,
"reward_std": 0.22934101521968842,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7480090260505676,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.2910963296890259,
"step": 178
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6706262807377048,
"calib/avg_num_step_conf": 5.80078125,
"calib/ece": 0.19084,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.204,
"calib/gap": 0.19332351434426215,
"calib/mean_conf": 0.5530799999999999,
"calib/mu_c": 0.6474218749999999,
"calib/mu_w": 0.4540983606557378,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11595999999999998,
"calib/std_conf": 0.3392670240385882,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4167318361955086,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.04583897905265144,
"calib/step_q_w": 0.37089285714285714,
"calib/step_q_w_n": 728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 560.765625,
"completions/mean_terminated_length": 560.765625,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.004369755275547504,
"kl": 0.162078857421875,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0654,
"num_tokens": 43173071.0,
"reward": 0.6135045289993286,
"reward_std": 0.2431424856185913,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7089457511901855,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.22353218495845795,
"step": 179
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6722066604419545,
"calib/avg_num_step_conf": 5.73046875,
"calib/ece": 0.18620734908136477,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2125984251968504,
"calib/gap": 0.22150368295466327,
"calib/mean_conf": 0.550249343832021,
"calib/mu_c": 0.6540246913580247,
"calib/mu_w": 0.4325210084033614,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10248031496062987,
"calib/std_conf": 0.34653134212289904,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41154993514915694,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.016650030934597604,
"calib/step_q_w": 0.39489990421455934,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 583.76171875,
"completions/mean_terminated_length": 586.051025390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.192,
"grad_norm": 0.00433363439515233,
"kl": 0.15771484375,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0156,
"num_tokens": 43426370.0,
"reward": 0.6204407215118408,
"reward_std": 0.229685440659523,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7350806593894958,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.20189449191093445,
"step": 180
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7165070242656449,
"calib/avg_num_step_conf": 5.98828125,
"calib/ece": 0.141897233201581,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.17786561264822134,
"calib/gap": 0.25479118773946385,
"calib/mean_conf": 0.508695652173913,
"calib/mu_c": 0.6547222222222224,
"calib/mu_w": 0.3999310344827586,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11185770750988143,
"calib/std_conf": 0.33311602012883124,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4376819078947368,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.06188947546230439,
"calib/step_q_w": 0.37579243243243243,
"calib/step_q_w_n": 925.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2604.0,
"completions/max_terminated_length": 2604.0,
"completions/mean_length": 522.85546875,
"completions/mean_terminated_length": 522.85546875,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.08254922181367874,
"kl": 1.1282501220703125,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0742,
"num_tokens": 43666485.0,
"reward": 0.616002082824707,
"reward_std": 0.24907004833221436,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.7522082328796387,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.1985459327697754,
"step": 181
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6683222958057395,
"calib/avg_num_step_conf": 5.58984375,
"calib/ece": 0.17320158102766797,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.17786561264822134,
"calib/gap": 0.1990053239838982,
"calib/mean_conf": 0.5536758893280633,
"calib/mu_c": 0.6339072847682119,
"calib/mu_w": 0.4349019607843137,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06501976284584982,
"calib/std_conf": 0.3407958682349704,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41393501805054156,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.05379612916165272,
"calib/step_q_w": 0.36013888888888884,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1441.0,
"completions/max_terminated_length": 1441.0,
"completions/mean_length": 523.234375,
"completions/mean_terminated_length": 525.2863159179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.004902321379631758,
"kl": 0.167877197265625,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0075,
"num_tokens": 43906593.0,
"reward": 0.671410858631134,
"reward_std": 0.2326556146144867,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7256957292556763,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.30228227376937866,
"step": 182
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6664479440069991,
"calib/avg_num_step_conf": 5.48046875,
"calib/ece": 0.21691699604743084,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.1857707509881423,
"calib/gap": 0.20984876890388693,
"calib/mean_conf": 0.4854545454545454,
"calib/mu_c": 0.5907936507936506,
"calib/mu_w": 0.3809448818897637,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10217391304347825,
"calib/std_conf": 0.36479065807455824,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43924961597542245,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.0876812205853515,
"calib/step_q_w": 0.35156839539007095,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1870.0,
"completions/max_terminated_length": 1870.0,
"completions/mean_length": 536.94140625,
"completions/mean_terminated_length": 541.1693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.1952,
"grad_norm": 0.004713596776127815,
"kl": 0.1636810302734375,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0155,
"num_tokens": 44150730.0,
"reward": 0.6190246939659119,
"reward_std": 0.19719631969928741,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7093328237533569,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2334040105342865,
"step": 183
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.609497694129378,
"calib/avg_num_step_conf": 5.50390625,
"calib/ece": 0.2241094117647059,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.20392156862745098,
"calib/gap": 0.1496872616228344,
"calib/mean_conf": 0.5224003921568627,
"calib/mu_c": 0.5887323943661973,
"calib/mu_w": 0.4390451327433629,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09482352941176468,
"calib/std_conf": 0.3562133593401506,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4167713058419244,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.009820278985684328,
"calib/step_q_w": 0.4069510268562401,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1481.0,
"completions/max_terminated_length": 1481.0,
"completions/mean_length": 524.9140625,
"completions/mean_terminated_length": 526.9725952148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.004596102982759476,
"kl": 0.1666717529296875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0068,
"num_tokens": 44390388.0,
"reward": 0.614298939704895,
"reward_std": 0.2608785331249237,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6963027715682983,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.22213885188102722,
"step": 184
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7102599268547544,
"calib/avg_num_step_conf": 6.01171875,
"calib/ece": 0.1720161290322581,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.27419354838709675,
"calib/gap": 0.2664472309299896,
"calib/mean_conf": 0.5761290322580644,
"calib/mu_c": 0.7007575757575758,
"calib/mu_w": 0.4343103448275862,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10794354838709683,
"calib/std_conf": 0.36060032758429683,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42642047026279395,
"calib/step_q_c_n": 723.0,
"calib/step_q_gap": 0.08244007810593124,
"calib/step_q_w": 0.3439803921568627,
"calib/step_q_w_n": 816.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 549.328125,
"completions/mean_terminated_length": 551.4823608398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.004440247546881437,
"kl": 0.1527557373046875,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0455,
"num_tokens": 44637936.0,
"reward": 0.5938361287117004,
"reward_std": 0.23641090095043182,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7243554592132568,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.16644182801246643,
"step": 185
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6337535014005603,
"calib/avg_num_step_conf": 5.8828125,
"calib/ece": 0.18752988047808766,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.23107569721115537,
"calib/gap": 0.17142793481028779,
"calib/mean_conf": 0.5533466135458168,
"calib/mu_c": 0.6346212121212121,
"calib/mu_w": 0.46319327731092436,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10749003984063746,
"calib/std_conf": 0.3492233206242315,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.420696370967742,
"calib/step_q_c_n": 744.0,
"calib/step_q_gap": 0.04485267018034034,
"calib/step_q_w": 0.37584370078740165,
"calib/step_q_w_n": 762.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2601.0,
"completions/max_terminated_length": 2601.0,
"completions/mean_length": 548.3515625,
"completions/mean_terminated_length": 550.5020141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.1984,
"grad_norm": 0.0045608277432620525,
"kl": 0.1638031005859375,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0158,
"num_tokens": 44883354.0,
"reward": 0.596014142036438,
"reward_std": 0.24676115810871124,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6918773651123047,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.20249465107917786,
"step": 186
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6417312661498709,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.24040160642570285,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.26104417670682734,
"calib/gap": 0.18380813953488379,
"calib/mean_conf": 0.5526907630522089,
"calib/mu_c": 0.6479166666666667,
"calib/mu_w": 0.4641085271317829,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15558232931726912,
"calib/std_conf": 0.35854512572862435,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3941972101972102,
"calib/step_q_c_n": 693.0,
"calib/step_q_gap": 0.028360524262720788,
"calib/step_q_w": 0.3658366859344894,
"calib/step_q_w_n": 865.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2515.0,
"completions/max_terminated_length": 2515.0,
"completions/mean_length": 558.8984375,
"completions/mean_terminated_length": 561.0902099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.004232458770275116,
"kl": 0.167236328125,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0541,
"num_tokens": 45127976.0,
"reward": 0.6228945851325989,
"reward_std": 0.23268939554691315,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6891741752624512,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.2675524652004242,
"step": 187
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6111542443064182,
"calib/avg_num_step_conf": 5.90234375,
"calib/ece": 0.22211999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.256,
"calib/gap": 0.14758928571428576,
"calib/mean_conf": 0.58388,
"calib/mu_c": 0.65,
"calib/mu_w": 0.5024107142857143,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.127,
"calib/std_conf": 0.3454801667245169,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.422124347826087,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.05691429116886321,
"calib/step_q_w": 0.3652100566572238,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2363.0,
"completions/max_terminated_length": 2363.0,
"completions/mean_length": 570.59375,
"completions/mean_terminated_length": 577.3596801757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.004320810083299875,
"kl": 0.1486968994140625,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0141,
"num_tokens": 45378120.0,
"reward": 0.6399295330047607,
"reward_std": 0.2552989721298218,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6887964606285095,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.287156343460083,
"step": 188
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7073451602863368,
"calib/avg_num_step_conf": 5.75390625,
"calib/ece": 0.17826771653543305,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2283464566929134,
"calib/gap": 0.2525477746654217,
"calib/mean_conf": 0.5096062992125984,
"calib/mu_c": 0.627925925925926,
"calib/mu_w": 0.3753781512605043,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07818897637795273,
"calib/std_conf": 0.3587651647315927,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40935324200913237,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.086860061210568,
"calib/step_q_w": 0.32249318079856437,
"calib/step_q_w_n": 743.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2301.0,
"completions/max_terminated_length": 2301.0,
"completions/mean_length": 544.44921875,
"completions/mean_terminated_length": 544.44921875,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"epoch": 0.2016,
"grad_norm": 0.004429060034453869,
"kl": 0.1707916259765625,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0343,
"num_tokens": 45625267.0,
"reward": 0.6407199501991272,
"reward_std": 0.21337494254112244,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7381328344345093,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2401820719242096,
"step": 189
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7381419175309891,
"calib/avg_num_step_conf": 5.9609375,
"calib/ece": 0.14444444444444443,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.21031746031746032,
"calib/gap": 0.3039615481912472,
"calib/mean_conf": 0.510952380952381,
"calib/mu_c": 0.6532835820895523,
"calib/mu_w": 0.3493220338983051,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06182539682539681,
"calib/std_conf": 0.3595774163755935,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4141890166028097,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.035158061017345366,
"calib/step_q_w": 0.37903095558546435,
"calib/step_q_w_n": 743.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2660.0,
"completions/max_terminated_length": 2660.0,
"completions/mean_length": 580.78515625,
"completions/mean_terminated_length": 583.0628051757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.004130993504077196,
"kl": 0.1551055908203125,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0263,
"num_tokens": 45879556.0,
"reward": 0.6636803150177002,
"reward_std": 0.24022971093654633,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.760574996471405,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.26522326469421387,
"step": 190
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7302767052767052,
"calib/avg_num_step_conf": 6.11328125,
"calib/ece": 0.17864541832669323,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.30677290836653387,
"calib/gap": 0.3050456885456885,
"calib/mean_conf": 0.5969721115537848,
"calib/mu_c": 0.767117117117117,
"calib/mu_w": 0.4620714285714286,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16669322709163348,
"calib/std_conf": 0.3553310493057434,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4486068111455109,
"calib/step_q_c_n": 646.0,
"calib/step_q_gap": 0.09276459134137593,
"calib/step_q_w": 0.35584221980413494,
"calib/step_q_w_n": 919.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2223.0,
"completions/max_terminated_length": 2223.0,
"completions/mean_length": 571.17578125,
"completions/mean_terminated_length": 571.17578125,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.00448433356359601,
"kl": 0.1585540771484375,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0019,
"num_tokens": 46129945.0,
"reward": 0.5969818830490112,
"reward_std": 0.2233514040708542,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.7355577945709229,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.17637458443641663,
"step": 191
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7258225324027916,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.16803149606299206,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.20866141732283464,
"calib/gap": 0.28928339980059814,
"calib/mean_conf": 0.5285826771653543,
"calib/mu_c": 0.6834745762711864,
"calib/mu_w": 0.3941911764705882,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11602362204724402,
"calib/std_conf": 0.36059601669870744,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43136098654708527,
"calib/step_q_c_n": 669.0,
"calib/step_q_gap": 0.05121139750598935,
"calib/step_q_w": 0.3801495890410959,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 550.34765625,
"completions/mean_terminated_length": 550.34765625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.2048,
"grad_norm": 0.004605600144714117,
"kl": 0.1724395751953125,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0644,
"num_tokens": 46375810.0,
"reward": 0.6355305910110474,
"reward_std": 0.22681453824043274,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7550976276397705,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.22533850371837616,
"step": 192
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6440483036227717,
"calib/avg_num_step_conf": 5.71484375,
"calib/ece": 0.2073968253968254,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.1984126984126984,
"calib/gap": 0.1773064979873491,
"calib/mean_conf": 0.48866666666666664,
"calib/mu_c": 0.5878738738738739,
"calib/mu_w": 0.4105673758865248,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12779365079365085,
"calib/std_conf": 0.3521201544566899,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40478499210110586,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.037604269209539576,
"calib/step_q_w": 0.3671807228915663,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1475.0,
"completions/max_terminated_length": 1475.0,
"completions/mean_length": 533.0546875,
"completions/mean_terminated_length": 535.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.004652594216167927,
"kl": 0.166473388671875,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0153,
"num_tokens": 46617984.0,
"reward": 0.6170186996459961,
"reward_std": 0.25677111744880676,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7034628391265869,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.24619954824447632,
"step": 193
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7412958480729845,
"calib/avg_num_step_conf": 5.98046875,
"calib/ece": 0.1687401574803149,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.22440944881889763,
"calib/gap": 0.3154831502513498,
"calib/mean_conf": 0.5441732283464566,
"calib/mu_c": 0.6969465648854961,
"calib/mu_w": 0.3814634146341463,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09858267716535431,
"calib/std_conf": 0.3682592397072139,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40561673202614373,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.04993387736992094,
"calib/step_q_w": 0.3556828546562228,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 525.4140625,
"completions/mean_terminated_length": 525.4140625,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.004900243133306503,
"kl": 0.1553802490234375,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0207,
"num_tokens": 46858434.0,
"reward": 0.6249498724937439,
"reward_std": 0.22597366571426392,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7653836011886597,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.18373483419418335,
"step": 194
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6280734516028634,
"calib/avg_num_step_conf": 5.71875,
"calib/ece": 0.24606299212598423,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.1968503937007874,
"calib/gap": 0.15394211017740417,
"calib/mean_conf": 0.49456692913385825,
"calib/mu_c": 0.5763865546218486,
"calib/mu_w": 0.42244444444444446,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13606299212598424,
"calib/std_conf": 0.35563352787724095,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41319080291970806,
"calib/step_q_c_n": 685.0,
"calib/step_q_gap": 0.029657683535882662,
"calib/step_q_w": 0.3835331193838254,
"calib/step_q_w_n": 779.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1285.0,
"completions/max_terminated_length": 1285.0,
"completions/mean_length": 512.2421875,
"completions/mean_terminated_length": 514.2510375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.208,
"grad_norm": 0.005007847677916288,
"kl": 0.173828125,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0025,
"num_tokens": 47095552.0,
"reward": 0.6037704944610596,
"reward_std": 0.20243516564369202,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6950304508209229,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.221104234457016,
"step": 195
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6890919158361017,
"calib/avg_num_step_conf": 5.43359375,
"calib/ece": 0.17752941176470585,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.30980392156862746,
"calib/gap": 0.2530952380952382,
"calib/mean_conf": 0.6149411764705882,
"calib/mu_c": 0.7400000000000001,
"calib/mu_w": 0.4869047619047619,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14329411764705882,
"calib/std_conf": 0.34986705664309997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4632410886319847,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.03366550446412592,
"calib/step_q_w": 0.4295755841678588,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2328.0,
"completions/max_terminated_length": 2328.0,
"completions/mean_length": 464.1953125,
"completions/mean_terminated_length": 464.1953125,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.004955775570124388,
"kl": 0.1866455078125,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0534,
"num_tokens": 47316930.0,
"reward": 0.638756275177002,
"reward_std": 0.2346639633178711,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7393644452095032,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.2381480634212494,
"step": 196
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6842257318952234,
"calib/avg_num_step_conf": 5.91015625,
"calib/ece": 0.1882399999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.18,
"calib/gap": 0.22706728299948636,
"calib/mean_conf": 0.5196000000000001,
"calib/mu_c": 0.6394915254237288,
"calib/mu_w": 0.4124242424242424,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11791999999999993,
"calib/std_conf": 0.34875297848190484,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42947526236881556,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.051351030690328536,
"calib/step_q_w": 0.378124231678487,
"calib/step_q_w_n": 846.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 567.01171875,
"completions/mean_terminated_length": 573.7352294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.004286373499780893,
"kl": 0.16259765625,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0174,
"num_tokens": 47567141.0,
"reward": 0.6364597082138062,
"reward_std": 0.2432287335395813,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7226440906524658,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.26355651021003723,
"step": 197
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7110732009925559,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.1606299212598425,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.23622047244094488,
"calib/gap": 0.27094044665012407,
"calib/mean_conf": 0.5249606299212598,
"calib/mu_c": 0.6572307692307692,
"calib/mu_w": 0.3862903225806451,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08688976377952755,
"calib/std_conf": 0.3550953604110169,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41581380563124426,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.02308978877731166,
"calib/step_q_w": 0.3927240168539326,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2404.0,
"completions/max_terminated_length": 2404.0,
"completions/mean_length": 496.15234375,
"completions/mean_terminated_length": 498.09808349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.2112,
"grad_norm": 0.005007683299481869,
"kl": 0.168792724609375,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0305,
"num_tokens": 47799540.0,
"reward": 0.6415027976036072,
"reward_std": 0.23499776422977448,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7438390254974365,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.2415103018283844,
"step": 198
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.663230457880091,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.2093253968253968,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.29365079365079366,
"calib/gap": 0.21554389071591207,
"calib/mean_conf": 0.5963095238095237,
"calib/mu_c": 0.6972388059701493,
"calib/mu_w": 0.48169491525423724,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13694444444444442,
"calib/std_conf": 0.3578100712000072,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4378330110497237,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.06646610457490354,
"calib/step_q_w": 0.3713669064748202,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 552.94140625,
"completions/mean_terminated_length": 555.10986328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.0044085741974413395,
"kl": 0.1686553955078125,
"learning_rate": 2.777777777777778e-08,
"loss": 0.0059,
"num_tokens": 48045293.0,
"reward": 0.6249779462814331,
"reward_std": 0.28111791610717773,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7144218683242798,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.23475277423858643,
"step": 199
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7524773937817416,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.11980392156862743,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2235294117647059,
"calib/gap": 0.30327015979189903,
"calib/mean_conf": 0.5554901960784313,
"calib/mu_c": 0.6946376811594204,
"calib/mu_w": 0.39136752136752134,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06705882352941178,
"calib/std_conf": 0.34173211458564623,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43393265398550723,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.023146218199071533,
"calib/step_q_w": 0.4107864357864357,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1414.0,
"completions/max_terminated_length": 1414.0,
"completions/mean_length": 537.23828125,
"completions/mean_terminated_length": 539.3451538085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.004761539865285158,
"kl": 0.1644439697265625,
"learning_rate": 0.0,
"loss": 0.0154,
"num_tokens": 48290874.0,
"reward": 0.7025998830795288,
"reward_std": 0.21370618045330048,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.782248854637146,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.31591975688934326,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.03833882060367614,
"train_runtime": 12677.532,
"train_samples_per_second": 4.039,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 48290874,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}