Files
PureRL-1.5B-v5-06-uppl/trainer_state.json
ModelHub XC 03571b91a1 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v5-06-uppl
Source: Original Platform
2026-06-05 06:43:18 +08:00

8641 lines
338 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.6944444444444445,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.6230769230769231,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.03861111111111115,
"calib/mean_conf": 0.9307692307692309,
"calib/mu_c": 0.9575,
"calib/mu_w": 0.9188888888888889,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.6230769230769231,
"calib/std_conf": 0.07965903671384378,
"calib/step_conf_rate": 0.0703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2955.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 613.67578125,
"completions/mean_terminated_length": 674.2532348632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.004048487171530724,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0322,
"num_tokens": 264685.0,
"reward": 0.055236753076314926,
"reward_std": 0.11281141638755798,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.01655624993145466,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.024703249335289,
"step": 1
},
{
"calib/answer_extract_rate": 0.13671875,
"calib/auroc": 0.5338345864661654,
"calib/avg_num_step_conf": 0.55078125,
"calib/ece": 0.6261538461538463,
"calib/final_conf_rate": 0.1015625,
"calib/format_rate": 0.08984375,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.002406015037593856,
"calib/mean_conf": 0.8953846153846153,
"calib/mu_c": 0.897142857142857,
"calib/mu_w": 0.8947368421052632,
"calib/nonempty_final_conf_rate": 0.1015625,
"calib/nonempty_reasoning_rate": 0.14453125,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.6261538461538463,
"calib/std_conf": 0.18653172073466937,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 646.4609375,
"completions/mean_terminated_length": 683.8594970703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.006200637202709913,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0643,
"num_tokens": 533467.0,
"reward": 0.11156807839870453,
"reward_std": 0.21452845633029938,
"rewards/accuracy_reward_step": 0.03125,
"rewards/final_brier_reward_step": 0.02965039201080799,
"rewards/format_reward_step": 0.08984375,
"rewards/stepwise_brier_reward": 0.04943438619375229,
"step": 2
},
{
"calib/answer_extract_rate": 0.05859375,
"calib/auroc": 0.55,
"calib/avg_num_step_conf": 0.26171875,
"calib/ece": 0.7683333333333333,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.9166666666666666,
"calib/gap": 0.04800000000000004,
"calib/mean_conf": 0.935,
"calib/mu_c": 0.975,
"calib/mu_w": 0.9269999999999999,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.08203125,
"calib/nonempty_step_conf_rate": 0.0546875,
"calib/pce": 0.7683333333333333,
"calib/std_conf": 0.1321299865031906,
"calib/step_conf_rate": 0.0546875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 637.359375,
"completions/mean_terminated_length": 703.2930908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0032,
"grad_norm": 0.003514436539262533,
"learning_rate": 7.5e-07,
"loss": 0.0066,
"num_tokens": 801887.0,
"reward": 0.030613092705607414,
"reward_std": 0.07494865357875824,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.005057031288743019,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.01583283767104149,
"step": 3
},
{
"calib/answer_extract_rate": 0.0625,
"calib/auroc": 0.55,
"calib/avg_num_step_conf": 0.3046875,
"calib/ece": 0.7733333333333331,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.03125,
"calib/frac_conf_gt_0.9": 0.9166666666666666,
"calib/gap": 0.02400000000000002,
"calib/mean_conf": 0.9400000000000001,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9359999999999999,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.078125,
"calib/nonempty_step_conf_rate": 0.0625,
"calib/pce": 0.7733333333333331,
"calib/std_conf": 0.07291547618075786,
"calib/step_conf_rate": 0.0625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 701.19921875,
"completions/mean_terminated_length": 773.737060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.003428457770496607,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0312,
"num_tokens": 1087562.0,
"reward": 0.0333736427128315,
"reward_std": 0.08957314491271973,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.00750117190182209,
"rewards/format_reward_step": 0.03125,
"rewards/stepwise_brier_reward": 0.0166183989495039,
"step": 4
},
{
"calib/answer_extract_rate": 0.06640625,
"calib/avg_num_step_conf": 0.234375,
"calib/ece": 0.8066666666666666,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.02734375,
"calib/frac_conf_gt_0.9": 0.5555555555555556,
"calib/mean_conf": 0.8066666666666665,
"calib/mu_c": NaN,
"calib/mu_w": 0.8066666666666665,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.08203125,
"calib/nonempty_step_conf_rate": 0.04296875,
"calib/pce": 0.8066666666666666,
"calib/std_conf": 0.2680795901717747,
"calib/step_conf_rate": 0.04296875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 679.66015625,
"completions/mean_terminated_length": 737.2584838867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.0035970297176390886,
"learning_rate": 1.25e-06,
"loss": 0.0245,
"num_tokens": 1368243.0,
"reward": 0.019157392904162407,
"reward_std": 0.05418529361486435,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.006841015536338091,
"rewards/format_reward_step": 0.02734375,
"rewards/stepwise_brier_reward": 0.015101059339940548,
"step": 5
},
{
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.7222222222222222,
"calib/avg_num_step_conf": 0.45703125,
"calib/ece": 0.7522277777777776,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.046875,
"calib/frac_conf_gt_0.9": 0.9444444444444444,
"calib/gap": 0.07732666666666665,
"calib/mean_conf": 0.9188944444444443,
"calib/mu_c": 0.9833333333333334,
"calib/mu_w": 0.9060066666666667,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.1171875,
"calib/nonempty_step_conf_rate": 0.0859375,
"calib/pce": 0.7522277777777776,
"calib/std_conf": 0.2212836301427103,
"calib/step_conf_rate": 0.0859375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 647.62890625,
"completions/mean_terminated_length": 687.9378051757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 0.005158636253327131,
"learning_rate": 1.5e-06,
"loss": 0.0173,
"num_tokens": 1639988.0,
"reward": 0.057144567370414734,
"reward_std": 0.14538687467575073,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.013710929080843925,
"rewards/format_reward_step": 0.046875,
"rewards/stepwise_brier_reward": 0.027367327362298965,
"step": 6
},
{
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.33333333333333337,
"calib/avg_num_step_conf": 0.359375,
"calib/ece": 0.6594736842105262,
"calib/final_conf_rate": 0.07421875,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.6842105263157895,
"calib/gap": -0.032916666666666816,
"calib/mean_conf": 0.8110526315789474,
"calib/mu_c": 0.7833333333333332,
"calib/mu_w": 0.81625,
"calib/nonempty_final_conf_rate": 0.07421875,
"calib/nonempty_reasoning_rate": 0.09375,
"calib/nonempty_step_conf_rate": 0.07421875,
"calib/pce": 0.6563157894736841,
"calib/std_conf": 0.2555673716272154,
"calib/step_conf_rate": 0.07421875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 733.5,
"completions/mean_terminated_length": 788.974853515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.004818637389689684,
"learning_rate": 1.75e-06,
"loss": 0.0298,
"num_tokens": 1935188.0,
"reward": 0.061079807579517365,
"reward_std": 0.12600858509540558,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.020985547453165054,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.035833682864904404,
"step": 7
},
{
"calib/answer_extract_rate": 0.0859375,
"calib/auroc": 0.6333333333333333,
"calib/avg_num_step_conf": 0.3515625,
"calib/ece": 0.7822222222222222,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.05078125,
"calib/frac_conf_gt_0.9": 0.8888888888888888,
"calib/gap": -0.018666666666667053,
"calib/mean_conf": 0.948888888888889,
"calib/mu_c": 0.9333333333333332,
"calib/mu_w": 0.9520000000000003,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.09765625,
"calib/nonempty_step_conf_rate": 0.078125,
"calib/pce": 0.7822222222222222,
"calib/std_conf": 0.05054029073575181,
"calib/step_conf_rate": 0.078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2914.0,
"completions/max_terminated_length": 2914.0,
"completions/mean_length": 638.765625,
"completions/mean_terminated_length": 701.8197631835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.004740505013614893,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0314,
"num_tokens": 2205224.0,
"reward": 0.059289030730724335,
"reward_std": 0.12507638335227966,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.01568320393562317,
"rewards/format_reward_step": 0.05078125,
"rewards/stepwise_brier_reward": 0.026160426437854767,
"step": 8
},
{
"calib/answer_extract_rate": 0.08203125,
"calib/auroc": 0.6666666666666666,
"calib/avg_num_step_conf": 0.18359375,
"calib/ece": 0.7,
"calib/final_conf_rate": 0.05859375,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.7333333333333333,
"calib/gap": 0.07500000000000018,
"calib/mean_conf": 0.9,
"calib/mu_c": 0.96,
"calib/mu_w": 0.8849999999999998,
"calib/nonempty_final_conf_rate": 0.05859375,
"calib/nonempty_reasoning_rate": 0.08984375,
"calib/nonempty_step_conf_rate": 0.046875,
"calib/pce": 0.7,
"calib/std_conf": 0.11366617790706256,
"calib/step_conf_rate": 0.046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 577.609375,
"completions/mean_terminated_length": 645.7117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0096,
"grad_norm": 0.004966007545590401,
"learning_rate": 2.25e-06,
"loss": 0.0356,
"num_tokens": 2460628.0,
"reward": 0.04672419652342796,
"reward_std": 0.10924308001995087,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.014699999243021011,
"rewards/format_reward_step": 0.0390625,
"rewards/stepwise_brier_reward": 0.023759275674819946,
"step": 9
},
{
"calib/answer_extract_rate": 0.14453125,
"calib/auroc": 0.8888888888888888,
"calib/avg_num_step_conf": 0.546875,
"calib/ece": 0.874642857142857,
"calib/final_conf_rate": 0.109375,
"calib/format_rate": 0.09375,
"calib/frac_conf_gt_0.9": 0.8214285714285714,
"calib/gap": 0.08259259259259266,
"calib/mean_conf": 0.9103571428571428,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9074074074074073,
"calib/nonempty_final_conf_rate": 0.109375,
"calib/nonempty_reasoning_rate": 0.15625,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.874642857142857,
"calib/std_conf": 0.18690108428289667,
"calib/step_conf_rate": 0.109375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2996.0,
"completions/max_terminated_length": 2996.0,
"completions/mean_length": 623.09375,
"completions/mean_terminated_length": 670.218505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.007296231109648943,
"learning_rate": 2.5e-06,
"loss": 0.0498,
"num_tokens": 2726940.0,
"reward": 0.07001252472400665,
"reward_std": 0.1547928899526596,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.017438281327486038,
"rewards/format_reward_step": 0.09375,
"rewards/stepwise_brier_reward": 0.05167432874441147,
"step": 10
},
{
"calib/answer_extract_rate": 0.16796875,
"calib/auroc": 0.44444444444444436,
"calib/avg_num_step_conf": 0.65234375,
"calib/ece": 0.6481090909090907,
"calib/final_conf_rate": 0.12890625,
"calib/format_rate": 0.10546875,
"calib/frac_conf_gt_0.9": 0.5757575757575758,
"calib/gap": -0.08830277777777784,
"calib/mean_conf": 0.788109090909091,
"calib/mu_c": 0.7238888888888888,
"calib/mu_w": 0.8121916666666666,
"calib/nonempty_final_conf_rate": 0.12890625,
"calib/nonempty_reasoning_rate": 0.19921875,
"calib/nonempty_step_conf_rate": 0.15234375,
"calib/pce": 0.5817454545454543,
"calib/std_conf": 0.2969892237241352,
"calib/step_conf_rate": 0.15234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 615.04296875,
"completions/mean_terminated_length": 681.6060791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.005505493376404047,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0954,
"num_tokens": 2988871.0,
"reward": 0.1337997019290924,
"reward_std": 0.22659480571746826,
"rewards/accuracy_reward_step": 0.03515625,
"rewards/final_brier_reward_step": 0.047456204891204834,
"rewards/format_reward_step": 0.10546875,
"rewards/stepwise_brier_reward": 0.0658676028251648,
"step": 11
},
{
"calib/answer_extract_rate": 0.1796875,
"calib/auroc": 0.53125,
"calib/avg_num_step_conf": 0.68359375,
"calib/ece": 0.5513888888888889,
"calib/final_conf_rate": 0.140625,
"calib/format_rate": 0.11328125,
"calib/frac_conf_gt_0.9": 0.6388888888888888,
"calib/gap": 0.07458333333333345,
"calib/mean_conf": 0.8652777777777777,
"calib/mu_c": 0.9150000000000001,
"calib/mu_w": 0.8404166666666667,
"calib/nonempty_final_conf_rate": 0.140625,
"calib/nonempty_reasoning_rate": 0.21484375,
"calib/nonempty_step_conf_rate": 0.16015625,
"calib/pce": 0.5416666666666666,
"calib/std_conf": 0.20857301028005507,
"calib/step_conf_rate": 0.16015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 631.421875,
"completions/mean_terminated_length": 673.5167236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0128,
"grad_norm": 0.008510539308190346,
"learning_rate": 3e-06,
"loss": 0.0547,
"num_tokens": 3254691.0,
"reward": 0.15881484746932983,
"reward_std": 0.2684420943260193,
"rewards/accuracy_reward_step": 0.046875,
"rewards/final_brier_reward_step": 0.05684414133429527,
"rewards/format_reward_step": 0.11328125,
"rewards/stepwise_brier_reward": 0.07060275226831436,
"step": 12
},
{
"calib/answer_extract_rate": 0.2421875,
"calib/auroc": 0.36122448979591837,
"calib/avg_num_step_conf": 0.90625,
"calib/ece": 0.6199857142857143,
"calib/final_conf_rate": 0.19140625,
"calib/format_rate": 0.15234375,
"calib/frac_conf_gt_0.9": 0.8163265306122449,
"calib/gap": 0.040020000000000056,
"calib/mean_conf": 0.8914142857142857,
"calib/mu_c": 0.9199999999999999,
"calib/mu_w": 0.8799799999999999,
"calib/nonempty_final_conf_rate": 0.19140625,
"calib/nonempty_reasoning_rate": 0.30078125,
"calib/nonempty_step_conf_rate": 0.22265625,
"calib/pce": 0.6128428571428571,
"calib/std_conf": 0.1766206866105313,
"calib/step_conf_rate": 0.22265625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 626.4140625,
"completions/mean_terminated_length": 649.2388916015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.0077047646045684814,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0781,
"num_tokens": 3519645.0,
"reward": 0.2043651044368744,
"reward_std": 0.37253743410110474,
"rewards/accuracy_reward_step": 0.05859375,
"rewards/final_brier_reward_step": 0.07053398340940475,
"rewards/format_reward_step": 0.15234375,
"rewards/stepwise_brier_reward": 0.09067648649215698,
"step": 13
},
{
"calib/answer_extract_rate": 0.296875,
"calib/auroc": 0.5828804347826086,
"calib/avg_num_step_conf": 1.25,
"calib/ece": 0.6054903225806453,
"calib/final_conf_rate": 0.2421875,
"calib/format_rate": 0.21484375,
"calib/frac_conf_gt_0.9": 0.7419354838709677,
"calib/gap": 0.09208369565217389,
"calib/mean_conf": 0.8635548387096774,
"calib/mu_c": 0.931875,
"calib/mu_w": 0.8397913043478261,
"calib/nonempty_final_conf_rate": 0.2421875,
"calib/nonempty_reasoning_rate": 0.34375,
"calib/nonempty_step_conf_rate": 0.28125,
"calib/pce": 0.6054903225806453,
"calib/std_conf": 0.23968853019876643,
"calib/step_conf_rate": 0.28125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 3049.0,
"completions/max_terminated_length": 3049.0,
"completions/mean_length": 541.01171875,
"completions/mean_terminated_length": 579.4937133789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.00938513595610857,
"learning_rate": 3.5e-06,
"loss": 0.0275,
"num_tokens": 3763544.0,
"reward": 0.25877830386161804,
"reward_std": 0.35493573546409607,
"rewards/accuracy_reward_step": 0.0625,
"rewards/final_brier_reward_step": 0.09642301499843597,
"rewards/format_reward_step": 0.21484375,
"rewards/stepwise_brier_reward": 0.1340026557445526,
"step": 14
},
{
"calib/answer_extract_rate": 0.43359375,
"calib/auroc": 0.4211601307189543,
"calib/avg_num_step_conf": 1.921875,
"calib/ece": 0.7240764044943823,
"calib/final_conf_rate": 0.34765625,
"calib/format_rate": 0.3203125,
"calib/frac_conf_gt_0.9": 0.7415730337078652,
"calib/gap": -0.017873856209150274,
"calib/mean_conf": 0.8809303370786516,
"calib/mu_c": 0.8664705882352941,
"calib/mu_w": 0.8843444444444444,
"calib/nonempty_final_conf_rate": 0.34765625,
"calib/nonempty_reasoning_rate": 0.45703125,
"calib/nonempty_step_conf_rate": 0.3671875,
"calib/pce": 0.706997752808989,
"calib/std_conf": 0.2148621091155773,
"calib/step_conf_rate": 0.3671875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 501.13671875,
"completions/mean_terminated_length": 525.7827758789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.016,
"grad_norm": 0.011989172548055649,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0734,
"num_tokens": 3999715.0,
"reward": 0.33935385942459106,
"reward_std": 0.4770300090312958,
"rewards/accuracy_reward_step": 0.0703125,
"rewards/final_brier_reward_step": 0.10058455169200897,
"rewards/format_reward_step": 0.3203125,
"rewards/stepwise_brier_reward": 0.19433088600635529,
"step": 15
},
{
"calib/answer_extract_rate": 0.46875,
"calib/auroc": 0.606875,
"calib/avg_num_step_conf": 2.4140625,
"calib/ece": 0.6073454545454546,
"calib/final_conf_rate": 0.4296875,
"calib/format_rate": 0.33984375,
"calib/frac_conf_gt_0.9": 0.7363636363636363,
"calib/gap": 0.08548333333333324,
"calib/mean_conf": 0.8651636363636362,
"calib/mu_c": 0.9273333333333332,
"calib/mu_w": 0.84185,
"calib/nonempty_final_conf_rate": 0.4296875,
"calib/nonempty_reasoning_rate": 0.5703125,
"calib/nonempty_step_conf_rate": 0.46875,
"calib/pce": 0.5998909090909091,
"calib/std_conf": 0.24001157272373827,
"calib/step_conf_rate": 0.46875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 515.44140625,
"completions/mean_terminated_length": 536.394287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.010875885374844074,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0832,
"num_tokens": 4240516.0,
"reward": 0.4370579421520233,
"reward_std": 0.5463830232620239,
"rewards/accuracy_reward_step": 0.1171875,
"rewards/final_brier_reward_step": 0.15203005075454712,
"rewards/format_reward_step": 0.33984375,
"rewards/stepwise_brier_reward": 0.21338918805122375,
"step": 16
},
{
"calib/answer_extract_rate": 0.61328125,
"calib/auroc": 0.509936766034327,
"calib/avg_num_step_conf": 3.4453125,
"calib/ece": 0.6482816554809844,
"calib/final_conf_rate": 0.58203125,
"calib/format_rate": 0.52734375,
"calib/frac_conf_gt_0.9": 0.697986577181208,
"calib/gap": -0.05793030713640468,
"calib/mean_conf": 0.86016129753915,
"calib/mu_c": 0.8181715447154472,
"calib/mu_w": 0.8761018518518519,
"calib/nonempty_final_conf_rate": 0.58203125,
"calib/nonempty_reasoning_rate": 0.70703125,
"calib/nonempty_step_conf_rate": 0.64453125,
"calib/pce": 0.6166375838926175,
"calib/std_conf": 0.22649809088873865,
"calib/step_conf_rate": 0.64453125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 437.734375,
"completions/mean_terminated_length": 448.2400207519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.011370422318577766,
"learning_rate": 4.25e-06,
"loss": 0.0739,
"num_tokens": 4456104.0,
"reward": 0.6475951075553894,
"reward_std": 0.6690946817398071,
"rewards/accuracy_reward_step": 0.1640625,
"rewards/final_brier_reward_step": 0.21312937140464783,
"rewards/format_reward_step": 0.52734375,
"rewards/stepwise_brier_reward": 0.3381884694099426,
"step": 17
},
{
"calib/answer_extract_rate": 0.66015625,
"calib/auroc": 0.5611111111111111,
"calib/avg_num_step_conf": 3.05859375,
"calib/ece": 0.7279539393939392,
"calib/final_conf_rate": 0.64453125,
"calib/format_rate": 0.56640625,
"calib/frac_conf_gt_0.9": 0.7515151515151515,
"calib/gap": 0.053852592592592674,
"calib/mean_conf": 0.9097721212121211,
"calib/mu_c": 0.9538333333333334,
"calib/mu_w": 0.8999807407407407,
"calib/nonempty_final_conf_rate": 0.64453125,
"calib/nonempty_reasoning_rate": 0.73828125,
"calib/nonempty_step_conf_rate": 0.6796875,
"calib/pce": 0.7279539393939392,
"calib/std_conf": 0.15579792646065557,
"calib/step_conf_rate": 0.6796875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 450.29296875,
"completions/mean_terminated_length": 455.6324157714844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0192,
"grad_norm": 0.010597813874483109,
"learning_rate": 4.5e-06,
"loss": 0.2026,
"num_tokens": 4682099.0,
"reward": 0.6116340756416321,
"reward_std": 0.6417558193206787,
"rewards/accuracy_reward_step": 0.12890625,
"rewards/final_brier_reward_step": 0.1919897496700287,
"rewards/format_reward_step": 0.56640625,
"rewards/stepwise_brier_reward": 0.34829652309417725,
"step": 18
},
{
"calib/answer_extract_rate": 0.89453125,
"calib/auroc": 0.5453404223896027,
"calib/avg_num_step_conf": 4.3046875,
"calib/ece": 0.7384040909090911,
"calib/final_conf_rate": 0.859375,
"calib/format_rate": 0.80078125,
"calib/frac_conf_gt_0.9": 0.7454545454545455,
"calib/gap": 0.018375911977551285,
"calib/mean_conf": 0.8987686363636365,
"calib/mu_c": 0.914054054054054,
"calib/mu_w": 0.8956781420765028,
"calib/nonempty_final_conf_rate": 0.859375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.7344954545454547,
"calib/std_conf": 0.1694991208557371,
"calib/step_conf_rate": 0.90625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2385.0,
"completions/max_terminated_length": 2385.0,
"completions/mean_length": 289.2265625,
"completions/mean_terminated_length": 291.5039367675781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.01285602804273367,
"learning_rate": 4.75e-06,
"loss": 0.0468,
"num_tokens": 4860901.0,
"reward": 0.815706729888916,
"reward_std": 0.6081791520118713,
"rewards/accuracy_reward_step": 0.15234375,
"rewards/final_brier_reward_step": 0.24730388820171356,
"rewards/format_reward_step": 0.80078125,
"rewards/stepwise_brier_reward": 0.49989813566207886,
"step": 19
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.5055555555555555,
"calib/avg_num_step_conf": 4.4140625,
"calib/ece": 0.5623347639484979,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 0.703862660944206,
"calib/gap": 0.01590294117647051,
"calib/mean_conf": 0.8831072961373391,
"calib/mu_c": 0.89355,
"calib/mu_w": 0.8776470588235294,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.5510472103004291,
"calib/std_conf": 0.19168427095662832,
"calib/step_conf_rate": 0.94140625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2143.0,
"completions/max_terminated_length": 2143.0,
"completions/mean_length": 277.91015625,
"completions/mean_terminated_length": 277.91015625,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.010729345493018627,
"learning_rate": 5e-06,
"loss": 0.0631,
"num_tokens": 5036918.0,
"reward": 1.1385329961776733,
"reward_std": 0.6992599368095398,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.40258094668388367,
"rewards/format_reward_step": 0.86328125,
"rewards/stepwise_brier_reward": 0.5499885678291321,
"step": 20
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.491985049833887,
"calib/avg_num_step_conf": 4.609375,
"calib/ece": 0.6315103305785124,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.731404958677686,
"calib/gap": -0.00557931893687702,
"calib/mean_conf": 0.9006797520661156,
"calib/mu_c": 0.8967142857142858,
"calib/mu_w": 0.9022936046511628,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.6214669421487603,
"calib/std_conf": 0.17603323585962752,
"calib/step_conf_rate": 0.96875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2280.0,
"completions/max_terminated_length": 2280.0,
"completions/mean_length": 272.87890625,
"completions/mean_terminated_length": 272.87890625,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.0224,
"grad_norm": 0.009412923827767372,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0192,
"num_tokens": 5209735.0,
"reward": 1.085858941078186,
"reward_std": 0.7268727421760559,
"rewards/accuracy_reward_step": 0.2734375,
"rewards/final_brier_reward_step": 0.34340929985046387,
"rewards/format_reward_step": 0.890625,
"rewards/stepwise_brier_reward": 0.5781517028808594,
"step": 21
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.49042674795199087,
"calib/avg_num_step_conf": 4.50390625,
"calib/ece": 0.6581171548117155,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.7280334728033473,
"calib/gap": 0.022399504667555692,
"calib/mean_conf": 0.9007949790794979,
"calib/mu_c": 0.9177586206896552,
"calib/mu_w": 0.8953591160220995,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.6581171548117155,
"calib/std_conf": 0.15258098822075492,
"calib/step_conf_rate": 0.96875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2229.0,
"completions/max_terminated_length": 2229.0,
"completions/mean_length": 268.80078125,
"completions/mean_terminated_length": 268.80078125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.007968787103891373,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0675,
"num_tokens": 5380364.0,
"reward": 1.0447142124176025,
"reward_std": 0.7075128555297852,
"rewards/accuracy_reward_step": 0.23828125,
"rewards/final_brier_reward_step": 0.33695703744888306,
"rewards/format_reward_step": 0.90625,
"rewards/stepwise_brier_reward": 0.5997121930122375,
"step": 22
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.48513756960366855,
"calib/avg_num_step_conf": 4.1875,
"calib/ece": 0.6086831275720165,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.691358024691358,
"calib/gap": 0.008890435637078142,
"calib/mean_conf": 0.8956790123456789,
"calib/mu_c": 0.9019718309859154,
"calib/mu_w": 0.8930813953488372,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.6060905349794239,
"calib/std_conf": 0.15974726648051205,
"calib/step_conf_rate": 0.96875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1482.0,
"completions/max_terminated_length": 1482.0,
"completions/mean_length": 253.1328125,
"completions/mean_terminated_length": 253.1328125,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.008745127357542515,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0485,
"num_tokens": 5549102.0,
"reward": 1.1360033750534058,
"reward_std": 0.6448456645011902,
"rewards/accuracy_reward_step": 0.2890625,
"rewards/final_brier_reward_step": 0.36885470151901245,
"rewards/format_reward_step": 0.91796875,
"rewards/stepwise_brier_reward": 0.6048462986946106,
"step": 23
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4067348219890593,
"calib/avg_num_step_conf": 4.7109375,
"calib/ece": 0.6607661290322581,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6975806451612904,
"calib/gap": -0.025826383284010324,
"calib/mean_conf": 0.88375,
"calib/mu_c": 0.8640677966101695,
"calib/mu_w": 0.8898941798941798,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.6533064516129032,
"calib/std_conf": 0.18359989480108027,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1887.0,
"completions/max_terminated_length": 1887.0,
"completions/mean_length": 264.109375,
"completions/mean_terminated_length": 264.109375,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.0256,
"grad_norm": 0.008865960873663425,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0483,
"num_tokens": 5721226.0,
"reward": 1.070737600326538,
"reward_std": 0.50026535987854,
"rewards/accuracy_reward_step": 0.23046875,
"rewards/final_brier_reward_step": 0.34463945031166077,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.6492486596107483,
"step": 24
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.48281747837960853,
"calib/avg_num_step_conf": 4.47265625,
"calib/ece": 0.5816931174089068,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6761133603238867,
"calib/gap": 0.00861025641025659,
"calib/mean_conf": 0.8766728744939271,
"calib/mu_c": 0.8825641025641028,
"calib/mu_w": 0.8739538461538462,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.5712882591093117,
"calib/std_conf": 0.19493444354551928,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 544.0,
"completions/max_terminated_length": 544.0,
"completions/mean_length": 235.96484375,
"completions/mean_terminated_length": 236.8902130126953,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.00847149733453989,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0065,
"num_tokens": 5884857.0,
"reward": 1.2149608135223389,
"reward_std": 0.6195580959320068,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.41003310680389404,
"rewards/format_reward_step": 0.94921875,
"rewards/stepwise_brier_reward": 0.6763729453086853,
"step": 25
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4211283571677712,
"calib/avg_num_step_conf": 4.70703125,
"calib/ece": 0.6318273092369477,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5622489959839357,
"calib/gap": -0.044770230205790185,
"calib/mean_conf": 0.8554417670682731,
"calib/mu_c": 0.8216393442622949,
"calib/mu_w": 0.8664095744680851,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.621144578313253,
"calib/std_conf": 0.1857087952927548,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1691.0,
"completions/max_terminated_length": 1691.0,
"completions/mean_length": 277.8828125,
"completions/mean_terminated_length": 278.9725646972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.008542955853044987,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0436,
"num_tokens": 6061235.0,
"reward": 1.12515127658844,
"reward_std": 0.5599552392959595,
"rewards/accuracy_reward_step": 0.23828125,
"rewards/final_brier_reward_step": 0.3806217610836029,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7449833154678345,
"step": 26
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.47859314305016043,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.5581027667984191,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.42292490118577075,
"calib/gap": 0.0070055733828744415,
"calib/mean_conf": 0.7863241106719367,
"calib/mu_c": 0.7916129032258065,
"calib/mu_w": 0.784607329842932,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5496837944664033,
"calib/std_conf": 0.23719941596990085,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1065.0,
"completions/max_terminated_length": 1065.0,
"completions/mean_length": 270.0546875,
"completions/mean_terminated_length": 271.1137390136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.0288,
"grad_norm": 0.00831306166946888,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0214,
"num_tokens": 6235585.0,
"reward": 1.1712658405303955,
"reward_std": 0.5169739723205566,
"rewards/accuracy_reward_step": 0.24609375,
"rewards/final_brier_reward_step": 0.4588109254837036,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.7887527346611023,
"step": 27
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5224042637189104,
"calib/avg_num_step_conf": 4.34765625,
"calib/ece": 0.3866533864541833,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3784860557768924,
"calib/gap": 0.03475259902618777,
"calib/mean_conf": 0.753585657370518,
"calib/mu_c": 0.77421568627451,
"calib/mu_w": 0.7394630872483222,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3669322709163347,
"calib/std_conf": 0.2642328813209692,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 290.796875,
"completions/mean_terminated_length": 290.796875,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.007810839917510748,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0126,
"num_tokens": 6416973.0,
"reward": 1.4253153800964355,
"reward_std": 0.6782314777374268,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.5682843923568726,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.8126649856567383,
"step": 28
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.457579185520362,
"calib/avg_num_step_conf": 4.65625,
"calib/ece": 0.42056000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.24,
"calib/gap": -0.022616354234001212,
"calib/mean_conf": 0.6572,
"calib/mu_c": 0.6407352941176472,
"calib/mu_w": 0.6633516483516484,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40288000000000007,
"calib/std_conf": 0.269727566258994,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2290.0,
"completions/max_terminated_length": 2290.0,
"completions/mean_length": 343.81640625,
"completions/mean_terminated_length": 343.81640625,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.006815528497099876,
"learning_rate": 4.75e-06,
"loss": 0.1056,
"num_tokens": 6612118.0,
"reward": 1.2306079864501953,
"reward_std": 0.5452619791030884,
"rewards/accuracy_reward_step": 0.265625,
"rewards/final_brier_reward_step": 0.5531593561172485,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.8458348512649536,
"step": 29
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4404323513366067,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.29564,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.168,
"calib/gap": -0.057594108019640045,
"calib/mean_conf": 0.56796,
"calib/mu_c": 0.5320212765957446,
"calib/mu_w": 0.5896153846153847,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.24380000000000002,
"calib/std_conf": 0.28477401285931975,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 333.375,
"completions/mean_terminated_length": 333.375,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.032,
"grad_norm": 0.00658390112221241,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0417,
"num_tokens": 6804446.0,
"reward": 1.3912838697433472,
"reward_std": 0.6227531433105469,
"rewards/accuracy_reward_step": 0.3671875,
"rewards/final_brier_reward_step": 0.5929761528968811,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8471591472625732,
"step": 30
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4990901571546733,
"calib/avg_num_step_conf": 4.70703125,
"calib/ece": 0.2562549800796813,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.07171314741035857,
"calib/gap": 0.005024813895781632,
"calib/mean_conf": 0.47243027888446215,
"calib/mu_c": 0.47615384615384615,
"calib/mu_w": 0.4711290322580645,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23486055776892434,
"calib/std_conf": 0.26192984013835136,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1807.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 324.54296875,
"completions/mean_terminated_length": 324.54296875,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.006656688638031483,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0647,
"num_tokens": 6993441.0,
"reward": 1.260326862335205,
"reward_std": 0.46390360593795776,
"rewards/accuracy_reward_step": 0.25390625,
"rewards/final_brier_reward_step": 0.6808546781539917,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8838905096054077,
"step": 31
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5255952380952381,
"calib/avg_num_step_conf": 4.4765625,
"calib/ece": 0.18403162055335967,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.05138339920948617,
"calib/gap": 0.003857843137254846,
"calib/mean_conf": 0.38837944664031615,
"calib/mu_c": 0.3909411764705882,
"calib/mu_w": 0.38708333333333333,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11822134387351776,
"calib/std_conf": 0.24734507726159338,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2182.0,
"completions/max_terminated_length": 2182.0,
"completions/mean_length": 325.09375,
"completions/mean_terminated_length": 325.09375,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.007369569502770901,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0273,
"num_tokens": 7183369.0,
"reward": 1.3937727212905884,
"reward_std": 0.4453020393848419,
"rewards/accuracy_reward_step": 0.33203125,
"rewards/final_brier_reward_step": 0.7063257694244385,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9000150561332703,
"step": 32
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.507635052179251,
"calib/avg_num_step_conf": 4.73046875,
"calib/ece": 0.18476877470355732,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.043478260869565216,
"calib/gap": -0.0005787292817679979,
"calib/mean_conf": 0.38416403162055335,
"calib/mu_c": 0.38375,
"calib/mu_w": 0.384328729281768,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14217391304347826,
"calib/std_conf": 0.250309969440167,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 356.01171875,
"completions/mean_terminated_length": 356.01171875,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.0352,
"grad_norm": 0.00666170846670866,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0823,
"num_tokens": 7381380.0,
"reward": 1.3077948093414307,
"reward_std": 0.34341514110565186,
"rewards/accuracy_reward_step": 0.28125,
"rewards/final_brier_reward_step": 0.7101441621780396,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8804100155830383,
"step": 33
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5705141300646918,
"calib/avg_num_step_conf": 4.48828125,
"calib/ece": 0.17610236220472444,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": 0.047890364317330625,
"calib/mean_conf": 0.3272047244094488,
"calib/mu_c": 0.35831460674157306,
"calib/mu_w": 0.31042424242424244,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07645669291338583,
"calib/std_conf": 0.2094378993364557,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1350.0,
"completions/max_terminated_length": 1350.0,
"completions/mean_length": 302.9375,
"completions/mean_terminated_length": 302.9375,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.007425271440297365,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0247,
"num_tokens": 7564044.0,
"reward": 1.4313955307006836,
"reward_std": 0.5110543966293335,
"rewards/accuracy_reward_step": 0.34765625,
"rewards/final_brier_reward_step": 0.7439238429069519,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9113458395004272,
"step": 34
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5492224283064632,
"calib/avg_num_step_conf": 4.828125,
"calib/ece": 0.15236947791164657,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.012048192771084338,
"calib/gap": 0.025039235268939974,
"calib/mean_conf": 0.2701204819277108,
"calib/mu_c": 0.2865116279069768,
"calib/mu_w": 0.26147239263803684,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.038554216867469876,
"calib/std_conf": 0.2011981818398749,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2567.0,
"completions/max_terminated_length": 2567.0,
"completions/mean_length": 390.05078125,
"completions/mean_terminated_length": 391.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.005499858409166336,
"learning_rate": 4.583333333333333e-06,
"loss": 0.1043,
"num_tokens": 7773153.0,
"reward": 1.3877594470977783,
"reward_std": 0.5350881814956665,
"rewards/accuracy_reward_step": 0.33984375,
"rewards/final_brier_reward_step": 0.7160624861717224,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.8662256002426147,
"step": 35
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4528992248062016,
"calib/avg_num_step_conf": 4.6875,
"calib/ece": 0.32232283464566924,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": -0.03354976744186047,
"calib/mean_conf": 0.2274409448818898,
"calib/mu_c": 0.21093023255813953,
"calib/mu_w": 0.24448,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.020944881889763782,
"calib/std_conf": 0.18583114651863405,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 345.890625,
"completions/mean_terminated_length": 345.890625,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.0384,
"grad_norm": 0.0065825642086565495,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0394,
"num_tokens": 7964413.0,
"reward": 1.6298774480819702,
"reward_std": 0.4798555374145508,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.615270733833313,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8964267373085022,
"step": 36
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.43758962103106863,
"calib/avg_num_step_conf": 4.3046875,
"calib/ece": 0.28337398373983747,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.0040650406504065045,
"calib/gap": -0.02138818709457152,
"calib/mean_conf": 0.1728048780487805,
"calib/mu_c": 0.1601980198019802,
"calib/mu_w": 0.18158620689655172,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.022804878048780487,
"calib/std_conf": 0.16382328167412918,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2505.0,
"completions/max_terminated_length": 2505.0,
"completions/mean_length": 387.6953125,
"completions/mean_terminated_length": 389.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.006749349180608988,
"learning_rate": 4.527777777777778e-06,
"loss": 0.135,
"num_tokens": 8170759.0,
"reward": 1.4470248222351074,
"reward_std": 0.43770620226860046,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.6383277177810669,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8607094287872314,
"step": 37
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.41320792609452406,
"calib/avg_num_step_conf": 4.125,
"calib/ece": 0.3131474103585657,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.05134020618556702,
"calib/mean_conf": 0.1401593625498008,
"calib/mu_c": 0.10865979381443298,
"calib/mu_w": 0.16,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.033426294820717135,
"calib/std_conf": 0.132742953617327,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2396.0,
"completions/max_terminated_length": 2396.0,
"completions/mean_length": 392.48828125,
"completions/mean_terminated_length": 392.48828125,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.006610847543925047,
"learning_rate": 4.5e-06,
"loss": 0.1519,
"num_tokens": 8378124.0,
"reward": 1.4403471946716309,
"reward_std": 0.4136509895324707,
"rewards/accuracy_reward_step": 0.37890625,
"rewards/final_brier_reward_step": 0.6473687291145325,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8796446919441223,
"step": 38
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.45158783783783785,
"calib/avg_num_step_conf": 4.17578125,
"calib/ece": 0.3109697580645162,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.004032258064516129,
"calib/gap": -0.03289824324324324,
"calib/mean_conf": 0.13335282258064515,
"calib/mu_c": 0.11372,
"calib/mu_w": 0.14661824324324324,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.020548387096774194,
"calib/std_conf": 0.15494879979060558,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2362.0,
"completions/max_terminated_length": 2362.0,
"completions/mean_length": 410.68359375,
"completions/mean_terminated_length": 410.68359375,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.0416,
"grad_norm": 0.006809773854911327,
"learning_rate": 4.472222222222223e-06,
"loss": 0.147,
"num_tokens": 8589347.0,
"reward": 1.4385499954223633,
"reward_std": 0.4814203977584839,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.6225799322128296,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.8581823706626892,
"step": 39
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.526044761338879,
"calib/avg_num_step_conf": 4.35546875,
"calib/ece": 0.28428571428571425,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0007724301841949277,
"calib/mean_conf": 0.1364285714285714,
"calib/mu_c": 0.13595959595959595,
"calib/mu_w": 0.13673202614379087,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.013928571428571427,
"calib/std_conf": 0.14631431856996807,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2341.0,
"completions/max_terminated_length": 2341.0,
"completions/mean_length": 407.9609375,
"completions/mean_terminated_length": 407.9609375,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.006085592322051525,
"learning_rate": 4.444444444444444e-06,
"loss": 0.1084,
"num_tokens": 8800545.0,
"reward": 1.454925537109375,
"reward_std": 0.43846869468688965,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.6634172201156616,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8672226667404175,
"step": 40
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.518595041322314,
"calib/avg_num_step_conf": 4.0390625,
"calib/ece": 0.5467490118577075,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.003952569169960474,
"calib/gap": -0.008151515151515173,
"calib/mean_conf": 0.12036561264822135,
"calib/mu_c": 0.11753030303030303,
"calib/mu_w": 0.1256818181818182,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.007470355731225296,
"calib/std_conf": 0.12852342384790832,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 348.51953125,
"completions/mean_terminated_length": 348.51953125,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.007085585966706276,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0491,
"num_tokens": 8997014.0,
"reward": 1.7957595586776733,
"reward_std": 0.4599454700946808,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.46078142523765564,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8863190412521362,
"step": 41
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.531181969522346,
"calib/avg_num_step_conf": 3.80859375,
"calib/ece": 0.3637450199203187,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0048187988218721944,
"calib/mean_conf": 0.09824701195219124,
"calib/mu_c": 0.10087719298245613,
"calib/mu_w": 0.09605839416058394,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.003904382470119521,
"calib/std_conf": 0.10768532964222766,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1757.0,
"completions/max_terminated_length": 1757.0,
"completions/mean_length": 317.49609375,
"completions/mean_terminated_length": 317.49609375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0448,
"grad_norm": 0.008215104229748249,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0404,
"num_tokens": 9182661.0,
"reward": 1.5414741039276123,
"reward_std": 0.3999992907047272,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6019105315208435,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8921107053756714,
"step": 42
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6046896903275136,
"calib/avg_num_step_conf": 3.7421875,
"calib/ece": 0.3801195219123506,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.025647381164776342,
"calib/mean_conf": 0.09581673306772909,
"calib/mu_c": 0.10940677966101695,
"calib/mu_w": 0.0837593984962406,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.002908366533864542,
"calib/std_conf": 0.08601648062509853,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 413.08984375,
"completions/mean_terminated_length": 413.08984375,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.006696638185530901,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0935,
"num_tokens": 9393636.0,
"reward": 1.5401055812835693,
"reward_std": 0.4751012325286865,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5967199206352234,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8605777025222778,
"step": 43
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5637673243667646,
"calib/avg_num_step_conf": 3.55859375,
"calib/ece": 0.2813432795698924,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.004032258064516129,
"calib/gap": 0.010013213172208185,
"calib/mean_conf": 0.12905994623655914,
"calib/mu_c": 0.13515670103092783,
"calib/mu_w": 0.12514348785871965,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.009637096774193547,
"calib/std_conf": 0.11519578926866243,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 407.40234375,
"completions/mean_terminated_length": 409.0000305175781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.006890473887324333,
"learning_rate": 4.333333333333334e-06,
"loss": 0.123,
"num_tokens": 9604251.0,
"reward": 1.4265779256820679,
"reward_std": 0.394540011882782,
"rewards/accuracy_reward_step": 0.37890625,
"rewards/final_brier_reward_step": 0.6516605615615845,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.8671508431434631,
"step": 44
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5166147455867081,
"calib/avg_num_step_conf": 3.3828125,
"calib/ece": 0.3221752988047809,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.002900960539979236,
"calib/mean_conf": 0.11886055776892432,
"calib/mu_c": 0.117196261682243,
"calib/mu_w": 0.12009722222222223,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.007370517928286852,
"calib/std_conf": 0.10039789603361092,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2149.0,
"completions/max_terminated_length": 2149.0,
"completions/mean_length": 348.75,
"completions/mean_terminated_length": 350.11767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.048,
"grad_norm": 0.008182951249182224,
"learning_rate": 4.305555555555556e-06,
"loss": 0.0829,
"num_tokens": 9798579.0,
"reward": 1.483351230621338,
"reward_std": 0.47646263241767883,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6229609251022339,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8807565569877625,
"step": 45
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5675109760216144,
"calib/avg_num_step_conf": 3.1484375,
"calib/ece": 0.282601219512195,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.012485612968591686,
"calib/mean_conf": 0.16008170731707316,
"calib/mu_c": 0.16723809523809527,
"calib/mu_w": 0.15475248226950358,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.007926829268292683,
"calib/std_conf": 0.1016763230323449,
"calib/step_conf_rate": 0.9765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2417.0,
"completions/max_terminated_length": 2417.0,
"completions/mean_length": 388.64453125,
"completions/mean_terminated_length": 388.64453125,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.0073912180960178375,
"learning_rate": 4.277777777777778e-06,
"loss": 0.1144,
"num_tokens": 10002840.0,
"reward": 1.4707672595977783,
"reward_std": 0.4773736000061035,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/final_brier_reward_step": 0.6400457620620728,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.875836193561554,
"step": 46
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.4782161803713528,
"calib/avg_num_step_conf": 3.03515625,
"calib/ece": 0.3222764227642277,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.008130081300813009,
"calib/gap": -0.016023872679045065,
"calib/mean_conf": 0.16967479674796748,
"calib/mu_c": 0.16120689655172415,
"calib/mu_w": 0.17723076923076922,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.010203252032520326,
"calib/std_conf": 0.12033783717456865,
"calib/step_conf_rate": 0.9609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2722.0,
"completions/max_terminated_length": 2722.0,
"completions/mean_length": 407.453125,
"completions/mean_terminated_length": 410.6614074707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.007095417007803917,
"learning_rate": 4.25e-06,
"loss": 0.1039,
"num_tokens": 10213124.0,
"reward": 1.5107771158218384,
"reward_std": 0.47904813289642334,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.5969375371932983,
"rewards/format_reward_step": 0.93359375,
"rewards/stepwise_brier_reward": 0.8602335453033447,
"step": 47
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4848875661375662,
"calib/avg_num_step_conf": 2.63671875,
"calib/ece": 0.28132530120481924,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.004016064257028112,
"calib/gap": -0.013351190476190405,
"calib/mean_conf": 0.17457831325301204,
"calib/mu_c": 0.1668571428571429,
"calib/mu_w": 0.1802083333333333,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.017108433734939754,
"calib/std_conf": 0.10642003470546849,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2849.0,
"completions/max_terminated_length": 2849.0,
"completions/mean_length": 349.9921875,
"completions/mean_terminated_length": 349.9921875,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.0512,
"grad_norm": 0.008383657783269882,
"learning_rate": 4.222222222222223e-06,
"loss": 0.1454,
"num_tokens": 10406410.0,
"reward": 1.4861187934875488,
"reward_std": 0.49443191289901733,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6513808369636536,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.910281777381897,
"step": 48
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5583248666497332,
"calib/avg_num_step_conf": 2.9296875,
"calib/ece": 0.2803187250996016,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02079565659131316,
"calib/mean_conf": 0.2164940239043825,
"calib/mu_c": 0.22701612903225804,
"calib/mu_w": 0.20622047244094488,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.001394422310756969,
"calib/std_conf": 0.10320201712557886,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2487.0,
"completions/max_terminated_length": 2487.0,
"completions/mean_length": 345.34375,
"completions/mean_terminated_length": 345.34375,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.008482473902404308,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0898,
"num_tokens": 10599354.0,
"reward": 1.5967867374420166,
"reward_std": 0.4762223958969116,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.64497971534729,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.9062297344207764,
"step": 49
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5345538461538462,
"calib/avg_num_step_conf": 2.8125,
"calib/ece": 0.2992549019607843,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.012676923076923058,
"calib/mean_conf": 0.21486274509803924,
"calib/mu_c": 0.22107692307692306,
"calib/mu_w": 0.2084,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.0021568627450980395,
"calib/std_conf": 0.10195289562242836,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 359.57421875,
"completions/mean_terminated_length": 359.57421875,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.008044109679758549,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0145,
"num_tokens": 10796765.0,
"reward": 1.640196681022644,
"reward_std": 0.5175546407699585,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6471250057220459,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.9136614799499512,
"step": 50
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6100285013587857,
"calib/avg_num_step_conf": 2.7421875,
"calib/ece": 0.21911290322580645,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.03341419765360906,
"calib/mean_conf": 0.25016129032258067,
"calib/mu_c": 0.2691588785046729,
"calib/mu_w": 0.23574468085106384,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.018911290322580647,
"calib/std_conf": 0.12980434327794635,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2352.0,
"completions/max_terminated_length": 2352.0,
"completions/mean_length": 336.10546875,
"completions/mean_terminated_length": 338.751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.0544,
"grad_norm": 0.008155681192874908,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0826,
"num_tokens": 10992104.0,
"reward": 1.497969150543213,
"reward_std": 0.4611200988292694,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6864816546440125,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.8913326859474182,
"step": 51
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6185649599442704,
"calib/avg_num_step_conf": 2.609375,
"calib/ece": 0.4160317460317461,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.039866248693834955,
"calib/mean_conf": 0.2458730158730159,
"calib/mu_c": 0.2596363636363636,
"calib/mu_w": 0.21977011494252865,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.003571428571428571,
"calib/std_conf": 0.11298805837498471,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1465.0,
"completions/max_terminated_length": 1465.0,
"completions/mean_length": 305.45703125,
"completions/mean_terminated_length": 305.45703125,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.008880858309566975,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0706,
"num_tokens": 11178253.0,
"reward": 1.8421229124069214,
"reward_std": 0.402671217918396,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.5967246294021606,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.9280170202255249,
"step": 52
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5627160968113715,
"calib/avg_num_step_conf": 2.57421875,
"calib/ece": 0.2903187250996016,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02710846459213731,
"calib/mean_conf": 0.25549800796812755,
"calib/mu_c": 0.2678102189781022,
"calib/mu_w": 0.2407017543859649,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0,
"calib/std_conf": 0.10445481207180506,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 302.88671875,
"completions/mean_terminated_length": 304.07452392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.00845205970108509,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0155,
"num_tokens": 11361616.0,
"reward": 1.682370662689209,
"reward_std": 0.4656108617782593,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6496828198432922,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.9235502481460571,
"step": 53
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5679595612528082,
"calib/avg_num_step_conf": 2.71875,
"calib/ece": 0.32588235294117646,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.031105457909343215,
"calib/mean_conf": 0.3054901960784314,
"calib/mu_c": 0.3169565217391304,
"calib/mu_w": 0.2858510638297872,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0,
"calib/std_conf": 0.12430692132056384,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 582.0,
"completions/max_terminated_length": 582.0,
"completions/mean_length": 264.8515625,
"completions/mean_terminated_length": 265.89019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.0576,
"grad_norm": 0.010827116668224335,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0316,
"num_tokens": 11535650.0,
"reward": 1.833878517150879,
"reward_std": 0.3862270414829254,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6499722599983215,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9355419874191284,
"step": 54
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5752223966509681,
"calib/avg_num_step_conf": 2.56640625,
"calib/ece": 0.08820717131474104,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.00398406374501992,
"calib/gap": 0.027567373103087323,
"calib/mean_conf": 0.3441434262948207,
"calib/mu_c": 0.3602884615384615,
"calib/mu_w": 0.3327210884353742,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.009003984063745026,
"calib/std_conf": 0.12284623947196727,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1956.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 309.7421875,
"completions/mean_terminated_length": 309.7421875,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.009377531707286835,
"learning_rate": 4.027777777777779e-06,
"loss": 0.075,
"num_tokens": 11722768.0,
"reward": 1.5037016868591309,
"reward_std": 0.5124849081039429,
"rewards/accuracy_reward_step": 0.40625,
"rewards/final_brier_reward_step": 0.7266761660575867,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.913130521774292,
"step": 55
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5741666666666666,
"calib/avg_num_step_conf": 2.83203125,
"calib/ece": 0.08925781250000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.03299743589743592,
"calib/mean_conf": 0.3869921875,
"calib/mu_c": 0.4071,
"calib/mu_w": 0.3741025641025641,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04281250000000002,
"calib/std_conf": 0.13304111183752504,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1018.0,
"completions/max_terminated_length": 1018.0,
"completions/mean_length": 317.36328125,
"completions/mean_terminated_length": 318.60784912109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.008900254033505917,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0154,
"num_tokens": 11910853.0,
"reward": 1.5095279216766357,
"reward_std": 0.46018385887145996,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.7599589824676514,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.934402585029602,
"step": 56
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5446467408189897,
"calib/avg_num_step_conf": 2.96875,
"calib/ece": 0.11055118110236216,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02669110793512708,
"calib/mean_conf": 0.4231496062992126,
"calib/mu_c": 0.43586466165413534,
"calib/mu_w": 0.40917355371900826,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.005039370078740154,
"calib/std_conf": 0.12755854476421674,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 978.0,
"completions/max_terminated_length": 978.0,
"completions/mean_length": 314.765625,
"completions/mean_terminated_length": 316.0000305175781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.0608,
"grad_norm": 0.008326790295541286,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0128,
"num_tokens": 12098225.0,
"reward": 1.6808912754058838,
"reward_std": 0.5460004806518555,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7244125008583069,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9132156372070312,
"step": 57
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6601914414414415,
"calib/avg_num_step_conf": 3.140625,
"calib/ece": 0.05435294117647061,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.06885510510510517,
"calib/mean_conf": 0.43733333333333335,
"calib/mu_c": 0.47621621621621624,
"calib/mu_w": 0.40736111111111106,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.028196078431372566,
"calib/std_conf": 0.1259565232595517,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1952.0,
"completions/max_terminated_length": 1952.0,
"completions/mean_length": 348.11328125,
"completions/mean_terminated_length": 348.11328125,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.008115312084555626,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0317,
"num_tokens": 12293662.0,
"reward": 1.556930422782898,
"reward_std": 0.4506745934486389,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.7560132741928101,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9092081785202026,
"step": 58
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5198412698412698,
"calib/avg_num_step_conf": 3.44140625,
"calib/ece": 0.09462745098039219,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": 0.01144333702473238,
"calib/mean_conf": 0.49627450980392157,
"calib/mu_c": 0.5020634920634921,
"calib/mu_w": 0.4906201550387597,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04839215686274512,
"calib/std_conf": 0.12670733275457785,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2108.0,
"completions/max_terminated_length": 2108.0,
"completions/mean_length": 342.8125,
"completions/mean_terminated_length": 342.8125,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.008273365907371044,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0249,
"num_tokens": 12487670.0,
"reward": 1.6497235298156738,
"reward_std": 0.4084717035293579,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7368066310882568,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9167747497558594,
"step": 59
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6054436480474542,
"calib/avg_num_step_conf": 3.33984375,
"calib/ece": 0.05843137254901969,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.042457983193277204,
"calib/mean_conf": 0.47113725490196084,
"calib/mu_c": 0.493781512605042,
"calib/mu_w": 0.4513235294117648,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.03145098039215689,
"calib/std_conf": 0.12651544424172143,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1848.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 357.58203125,
"completions/mean_terminated_length": 357.58203125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.064,
"grad_norm": 0.008037107065320015,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0368,
"num_tokens": 12688067.0,
"reward": 1.6102734804153442,
"reward_std": 0.514053225517273,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.749447226524353,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9182088375091553,
"step": 60
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5427588364091336,
"calib/avg_num_step_conf": 3.59375,
"calib/ece": 0.05295275590551177,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.008005630278385967,
"calib/mean_conf": 0.5275984251968504,
"calib/mu_c": 0.5312230215827338,
"calib/mu_w": 0.5232173913043479,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.016653543307086578,
"calib/std_conf": 0.1110656158365047,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1936.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 325.60546875,
"completions/mean_terminated_length": 325.60546875,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.00897640734910965,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0316,
"num_tokens": 12875486.0,
"reward": 1.7183257341384888,
"reward_std": 0.4480532705783844,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7368570566177368,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9020707607269287,
"step": 61
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6227478194435603,
"calib/avg_num_step_conf": 3.62109375,
"calib/ece": 0.09615079365079357,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04824027503660788,
"calib/mean_conf": 0.5332142857142858,
"calib/mu_c": 0.5598230088495575,
"calib/mu_w": 0.5115827338129496,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09047619047619039,
"calib/std_conf": 0.11200014931557913,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2175.0,
"completions/max_terminated_length": 2175.0,
"completions/mean_length": 385.828125,
"completions/mean_terminated_length": 385.828125,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.007599617820233107,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0473,
"num_tokens": 13081338.0,
"reward": 1.5627398490905762,
"reward_std": 0.5017718076705933,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7422398328781128,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8993446826934814,
"step": 62
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5769374068554396,
"calib/avg_num_step_conf": 4.07421875,
"calib/ece": 0.06889763779527565,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": 0.03668777943368107,
"calib/mean_conf": 0.5392125984251969,
"calib/mu_c": 0.5582786885245901,
"calib/mu_w": 0.521590909090909,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06389763779527564,
"calib/std_conf": 0.12531915978725527,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1769.0,
"completions/max_terminated_length": 1769.0,
"completions/mean_length": 407.23828125,
"completions/mean_terminated_length": 408.8353271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.0672,
"grad_norm": 0.0076432013884186745,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0064,
"num_tokens": 13294231.0,
"reward": 1.6152989864349365,
"reward_std": 0.4999202489852905,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7365000247955322,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8965712785720825,
"step": 63
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5838155626692932,
"calib/avg_num_step_conf": 3.890625,
"calib/ece": 0.04698039215686273,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.028400640236395103,
"calib/mean_conf": 0.540235294117647,
"calib/mu_c": 0.5540458015267177,
"calib/mu_w": 0.5256451612903226,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.036745098039215676,
"calib/std_conf": 0.11822188978743951,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2032.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 384.0390625,
"completions/mean_terminated_length": 384.0390625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.00764108169823885,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0416,
"num_tokens": 13496321.0,
"reward": 1.6733813285827637,
"reward_std": 0.5097801685333252,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7405202984809875,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9061299562454224,
"step": 64
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5989147286821705,
"calib/avg_num_step_conf": 3.94140625,
"calib/ece": 0.10870472440944882,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.044506356589147256,
"calib/mean_conf": 0.565476377952756,
"calib/mu_c": 0.58808,
"calib/mu_w": 0.5435736434108528,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09102755905511811,
"calib/std_conf": 0.11349812441246987,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2238.0,
"completions/max_terminated_length": 2238.0,
"completions/mean_length": 358.71484375,
"completions/mean_terminated_length": 358.71484375,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.008037811145186424,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0679,
"num_tokens": 13693176.0,
"reward": 1.6414910554885864,
"reward_std": 0.3687446713447571,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7481565475463867,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9037454128265381,
"step": 65
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5864916165770326,
"calib/avg_num_step_conf": 4.12890625,
"calib/ece": 0.13609842519685034,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": 0.033580196140461704,
"calib/mean_conf": 0.5574448818897638,
"calib/mu_c": 0.5766146788990825,
"calib/mu_w": 0.5430344827586208,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13220472440944878,
"calib/std_conf": 0.1300869292693095,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2106.0,
"completions/max_terminated_length": 2106.0,
"completions/mean_length": 418.9765625,
"completions/mean_terminated_length": 418.9765625,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.0704,
"grad_norm": 0.007899136282503605,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.0477,
"num_tokens": 13906786.0,
"reward": 1.5405974388122559,
"reward_std": 0.4160727858543396,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.729138195514679,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9020017385482788,
"step": 66
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5375972611266729,
"calib/avg_num_step_conf": 4.43359375,
"calib/ece": 0.08854330708661423,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": 0.013299097416744399,
"calib/mean_conf": 0.586732283464567,
"calib/mu_c": 0.5929629629629629,
"calib/mu_w": 0.5796638655462185,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07188976377952758,
"calib/std_conf": 0.12793003432418582,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2491.0,
"completions/max_terminated_length": 2491.0,
"completions/mean_length": 441.52734375,
"completions/mean_terminated_length": 441.52734375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.007587770000100136,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0066,
"num_tokens": 14124825.0,
"reward": 1.6739743947982788,
"reward_std": 0.5032845735549927,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7150827646255493,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8792521953582764,
"step": 67
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5175111773472428,
"calib/avg_num_step_conf": 4.24609375,
"calib/ece": 0.0871653543307086,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.005992300049677213,
"calib/mean_conf": 0.5740157480314961,
"calib/mu_c": 0.5768939393939394,
"calib/mu_w": 0.5709016393442622,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07074803149606293,
"calib/std_conf": 0.11021996505570755,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2075.0,
"completions/max_terminated_length": 2075.0,
"completions/mean_length": 425.32421875,
"completions/mean_terminated_length": 425.32421875,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.007506520953029394,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0285,
"num_tokens": 14337796.0,
"reward": 1.6663700342178345,
"reward_std": 0.41317036747932434,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7226867079734802,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8881058096885681,
"step": 68
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5355238095238095,
"calib/avg_num_step_conf": 4.40625,
"calib/ece": 0.11617529880478081,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.00796812749003984,
"calib/gap": 0.016096507936508075,
"calib/mean_conf": 0.5799203187250996,
"calib/mu_c": 0.5879365079365081,
"calib/mu_w": 0.57184,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0970517928286852,
"calib/std_conf": 0.1241480674633258,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2173.0,
"completions/max_terminated_length": 2173.0,
"completions/mean_length": 487.3203125,
"completions/mean_terminated_length": 487.3203125,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.0736,
"grad_norm": 0.007550784852355719,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0568,
"num_tokens": 14567046.0,
"reward": 1.6235175132751465,
"reward_std": 0.47181078791618347,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7165261507034302,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.879106342792511,
"step": 69
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5996880509968807,
"calib/avg_num_step_conf": 4.359375,
"calib/ece": 0.16271255060728745,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.008097165991902834,
"calib/gap": 0.043323613183236076,
"calib/mean_conf": 0.5716194331983806,
"calib/mu_c": 0.5972277227722772,
"calib/mu_w": 0.5539041095890411,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16271255060728745,
"calib/std_conf": 0.11653485910890778,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 522.46484375,
"completions/mean_terminated_length": 522.46484375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.0067598153837025166,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0411,
"num_tokens": 14807789.0,
"reward": 1.452558159828186,
"reward_std": 0.5075742602348328,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.69819176197052,
"rewards/format_reward_step": 0.9453125,
"rewards/stepwise_brier_reward": 0.8542283773422241,
"step": 70
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5672232855917101,
"calib/avg_num_step_conf": 4.79296875,
"calib/ece": 0.16580645161290325,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.03225806451612903,
"calib/gap": 0.024713880271929334,
"calib/mean_conf": 0.6053225806451613,
"calib/mu_c": 0.6191743119266055,
"calib/mu_w": 0.5944604316546762,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.16580645161290325,
"calib/std_conf": 0.12702828922845497,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2405.0,
"completions/max_terminated_length": 2405.0,
"completions/mean_length": 526.90625,
"completions/mean_terminated_length": 528.9725952148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.006880332250148058,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0742,
"num_tokens": 15047085.0,
"reward": 1.5022592544555664,
"reward_std": 0.5741759538650513,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.6890460848808289,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.8590537309646606,
"step": 71
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5645356234096692,
"calib/avg_num_step_conf": 4.21875,
"calib/ece": 0.11254980079681268,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.01195219123505976,
"calib/gap": 0.03358651399491086,
"calib/mean_conf": 0.5906374501992032,
"calib/mu_c": 0.6081666666666666,
"calib/mu_w": 0.5745801526717558,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11254980079681268,
"calib/std_conf": 0.1300735006432676,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 491.3984375,
"completions/mean_terminated_length": 495.2677001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.0768,
"grad_norm": 0.007533502299338579,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0568,
"num_tokens": 15277291.0,
"reward": 1.5845632553100586,
"reward_std": 0.4872770607471466,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7165089845657349,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8717440366744995,
"step": 72
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5071045743487474,
"calib/avg_num_step_conf": 4.3984375,
"calib/ece": 0.14078431372549027,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.0045431883335410594,
"calib/mean_conf": 0.6023529411764705,
"calib/mu_c": 0.6043661971830986,
"calib/mu_w": 0.5998230088495575,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0931372549019608,
"calib/std_conf": 0.11657848042924825,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2376.0,
"completions/max_terminated_length": 2376.0,
"completions/mean_length": 485.21875,
"completions/mean_terminated_length": 485.21875,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.007058590184897184,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0067,
"num_tokens": 15508539.0,
"reward": 1.7351597547531128,
"reward_std": 0.43965622782707214,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7347210645675659,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8934180736541748,
"step": 73
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.528050896471949,
"calib/avg_num_step_conf": 4.24609375,
"calib/ece": 0.12879999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.004,
"calib/gap": 0.012280701754386003,
"calib/mean_conf": 0.5968000000000001,
"calib/mu_c": 0.6033333333333334,
"calib/mu_w": 0.5910526315789474,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12879999999999994,
"calib/std_conf": 0.11661286378440416,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1897.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 489.35546875,
"completions/mean_terminated_length": 493.2086486816406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.00773122813552618,
"learning_rate": 3.5e-06,
"loss": 0.0199,
"num_tokens": 15737742.0,
"reward": 1.5563820600509644,
"reward_std": 0.42694351077079773,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6976984739303589,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8637673854827881,
"step": 74
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5132485029940119,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.11716599190283394,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.020242914979757085,
"calib/gap": 0.00283532934131725,
"calib/mean_conf": 0.6114170040485829,
"calib/mu_c": 0.6123353293413173,
"calib/mu_w": 0.6095,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.026234817813765153,
"calib/std_conf": 0.13058268741825027,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2520.0,
"completions/max_terminated_length": 2520.0,
"completions/mean_length": 512.6015625,
"completions/mean_terminated_length": 512.6015625,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.08,
"grad_norm": 0.007090203929692507,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0481,
"num_tokens": 15973720.0,
"reward": 1.843395709991455,
"reward_std": 0.4259081482887268,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7201793193817139,
"rewards/format_reward_step": 0.9453125,
"rewards/stepwise_brier_reward": 0.848715603351593,
"step": 75
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6342339478703115,
"calib/avg_num_step_conf": 4.2734375,
"calib/ece": 0.0597233201581028,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.011857707509881422,
"calib/gap": 0.05968531468531457,
"calib/mean_conf": 0.59600790513834,
"calib/mu_c": 0.6219580419580419,
"calib/mu_w": 0.5622727272727274,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.045256916996047475,
"calib/std_conf": 0.12177975298835207,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2387.0,
"completions/max_terminated_length": 2387.0,
"completions/mean_length": 543.16796875,
"completions/mean_terminated_length": 543.16796875,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.007007664535194635,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0547,
"num_tokens": 16215827.0,
"reward": 1.7331316471099854,
"reward_std": 0.4490346610546112,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.750145673751831,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.877693772315979,
"step": 76
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5592497868712702,
"calib/avg_num_step_conf": 4.8203125,
"calib/ece": 0.10116666666666665,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.0625,
"calib/gap": 0.03336743393009378,
"calib/mean_conf": 0.6318333333333332,
"calib/mu_c": 0.6460144927536232,
"calib/mu_w": 0.6126470588235294,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.07900000000000001,
"calib/std_conf": 0.14462297727386045,
"calib/step_conf_rate": 0.97265625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 559.16796875,
"completions/mean_terminated_length": 565.7984619140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.007006470579653978,
"learning_rate": 3.416666666666667e-06,
"loss": 0.1127,
"num_tokens": 16463638.0,
"reward": 1.654437780380249,
"reward_std": 0.4745751917362213,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6986671686172485,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.8253339529037476,
"step": 77
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5848951507208388,
"calib/avg_num_step_conf": 4.58203125,
"calib/ece": 0.06309036144578307,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.028112449799196786,
"calib/gap": 0.036206716906946323,
"calib/mean_conf": 0.6094397590361446,
"calib/mu_c": 0.6252892857142858,
"calib/mu_w": 0.5890825688073394,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.055140562248995946,
"calib/std_conf": 0.13684808357244896,
"calib/step_conf_rate": 0.97265625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2823.0,
"completions/max_terminated_length": 2823.0,
"completions/mean_length": 601.453125,
"completions/mean_terminated_length": 601.453125,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"epoch": 0.0832,
"grad_norm": 0.006692703813314438,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0732,
"num_tokens": 16725634.0,
"reward": 1.694908618927002,
"reward_std": 0.4652491807937622,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7221781015396118,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8543316125869751,
"step": 78
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.584365741987611,
"calib/avg_num_step_conf": 4.5546875,
"calib/ece": 0.06428571428571424,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.03968253968253968,
"calib/gap": 0.04562079181255041,
"calib/mean_conf": 0.6375396825396825,
"calib/mu_c": 0.6545569620253164,
"calib/mu_w": 0.608936170212766,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.037420634920634875,
"calib/std_conf": 0.13801130910606224,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1469.0,
"completions/max_terminated_length": 1469.0,
"completions/mean_length": 541.921875,
"completions/mean_terminated_length": 546.18896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.006624994333833456,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0001,
"num_tokens": 16970742.0,
"reward": 1.8090747594833374,
"reward_std": 0.46922507882118225,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7410902976989746,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.846771240234375,
"step": 79
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6140056022408963,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.08090551181102364,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.09055118110236221,
"calib/gap": 0.05761624649859942,
"calib/mean_conf": 0.6861811023622046,
"calib/mu_c": 0.7052352941176471,
"calib/mu_w": 0.6476190476190476,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.04889763779527561,
"calib/std_conf": 0.1526510568410527,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 512.46875,
"completions/mean_terminated_length": 514.4784545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.007300146389752626,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0112,
"num_tokens": 17204094.0,
"reward": 1.8976236581802368,
"reward_std": 0.3876197636127472,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7715495824813843,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8580073714256287,
"step": 80
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5120606020333577,
"calib/avg_num_step_conf": 4.8203125,
"calib/ece": 0.10596000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.056,
"calib/gap": -0.0010200013289919374,
"calib/mean_conf": 0.6486000000000001,
"calib/mu_c": 0.6481879194630873,
"calib/mu_w": 0.6492079207920792,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.07928000000000007,
"calib/std_conf": 0.13323978384851873,
"calib/step_conf_rate": 0.9765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2699.0,
"completions/max_terminated_length": 2699.0,
"completions/mean_length": 593.37890625,
"completions/mean_terminated_length": 593.37890625,
"completions/min_length": 255.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.0864,
"grad_norm": 0.006872696802020073,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0943,
"num_tokens": 17462247.0,
"reward": 1.7374032735824585,
"reward_std": 0.5122434496879578,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7063796520233154,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.8369834423065186,
"step": 81
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4951629658485911,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.09696047430830036,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.02766798418972332,
"calib/gap": 0.0017204908453447576,
"calib/mean_conf": 0.6391739130434783,
"calib/mu_c": 0.6398675496688742,
"calib/mu_w": 0.6381470588235294,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.06964822134387352,
"calib/std_conf": 0.12718397784570376,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1997.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 513.69921875,
"completions/mean_terminated_length": 515.7137451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.007474968209862709,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0468,
"num_tokens": 17699306.0,
"reward": 1.7570503950119019,
"reward_std": 0.4566406011581421,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7145159244537354,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.852747917175293,
"step": 82
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5002272727272726,
"calib/avg_num_step_conf": 4.76953125,
"calib/ece": 0.147916,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.052,
"calib/gap": 0.0014292207792204747,
"calib/mean_conf": 0.652164,
"calib/mu_c": 0.6527928571428571,
"calib/mu_w": 0.6513636363636366,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.12004000000000001,
"calib/std_conf": 0.14608970225173298,
"calib/step_conf_rate": 0.96875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 565.4765625,
"completions/mean_terminated_length": 565.4765625,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.006757485214620829,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0716,
"num_tokens": 17951332.0,
"reward": 1.6795960664749146,
"reward_std": 0.4769311249256134,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6928104758262634,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.8380734324455261,
"step": 83
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6134135060129509,
"calib/avg_num_step_conf": 4.59375,
"calib/ece": 0.10415820312500004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0234375,
"calib/gap": 0.050948041936478505,
"calib/mean_conf": 0.604669921875,
"calib/mu_c": 0.6275567375886525,
"calib/mu_w": 0.576608695652174,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07902343750000004,
"calib/std_conf": 0.14945810134028834,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1380.0,
"completions/max_terminated_length": 1380.0,
"completions/mean_length": 497.9765625,
"completions/mean_terminated_length": 499.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0896,
"grad_norm": 0.007209381554275751,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.0137,
"num_tokens": 18184734.0,
"reward": 1.7335989475250244,
"reward_std": 0.4257844388484955,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7492986917495728,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8882222175598145,
"step": 84
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5502873563218391,
"calib/avg_num_step_conf": 5.04296875,
"calib/ece": 0.1505458167330678,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.05179282868525897,
"calib/gap": 0.04549584929757344,
"calib/mean_conf": 0.659788844621514,
"calib/mu_c": 0.6808148148148148,
"calib/mu_w": 0.6353189655172413,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1362430278884463,
"calib/std_conf": 0.15400511334146827,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 549.375,
"completions/mean_terminated_length": 553.7008056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.007047437597066164,
"learning_rate": 3.1944444444444443e-06,
"loss": 0.0244,
"num_tokens": 18433198.0,
"reward": 1.6656556129455566,
"reward_std": 0.42222103476524353,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7122802138328552,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8487800359725952,
"step": 85
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5687830687830687,
"calib/avg_num_step_conf": 5.08984375,
"calib/ece": 0.1667450980392158,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.043137254901960784,
"calib/gap": 0.03538252122554442,
"calib/mean_conf": 0.6458823529411765,
"calib/mu_c": 0.663781746031746,
"calib/mu_w": 0.6283992248062016,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15925490196078443,
"calib/std_conf": 0.1644900732518082,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1466.0,
"completions/max_terminated_length": 1466.0,
"completions/mean_length": 530.42578125,
"completions/mean_terminated_length": 532.5059204101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.008088787086308002,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.01,
"num_tokens": 18674499.0,
"reward": 1.6263952255249023,
"reward_std": 0.4225319027900696,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7082681655883789,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8676250576972961,
"step": 86
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5656147316538884,
"calib/avg_num_step_conf": 4.8046875,
"calib/ece": 0.10925196850393704,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.015748031496062992,
"calib/gap": 0.021319824753559535,
"calib/mean_conf": 0.6248425196850393,
"calib/mu_c": 0.6322289156626505,
"calib/mu_w": 0.610909090909091,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04027559055118113,
"calib/std_conf": 0.1470329078381842,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2272.0,
"completions/max_terminated_length": 2272.0,
"completions/mean_length": 491.3046875,
"completions/mean_terminated_length": 493.2314147949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0928,
"grad_norm": 0.008023228496313095,
"learning_rate": 3.138888888888889e-06,
"loss": 0.0066,
"num_tokens": 18905769.0,
"reward": 1.877850890159607,
"reward_std": 0.45576608180999756,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7548441290855408,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8815594911575317,
"step": 87
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5912598026815076,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.1577380952380954,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.09126984126984126,
"calib/gap": 0.05075006324310638,
"calib/mean_conf": 0.6775793650793651,
"calib/mu_c": 0.7013432835820895,
"calib/mu_w": 0.6505932203389831,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1517857142857144,
"calib/std_conf": 0.1594935363757059,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2424.0,
"completions/max_terminated_length": 2424.0,
"completions/mean_length": 540.6640625,
"completions/mean_terminated_length": 542.7843627929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.006685222499072552,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0279,
"num_tokens": 19154027.0,
"reward": 1.6725934743881226,
"reward_std": 0.4166548252105713,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7181754112243652,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8628235459327698,
"step": 88
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6243885613947071,
"calib/avg_num_step_conf": 5.3515625,
"calib/ece": 0.1483280632411068,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.07114624505928854,
"calib/gap": 0.058687256992349335,
"calib/mean_conf": 0.6623438735177865,
"calib/mu_c": 0.6899477611940299,
"calib/mu_w": 0.6312605042016806,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14051383399209497,
"calib/std_conf": 0.1611301070417088,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2196.0,
"completions/max_terminated_length": 2196.0,
"completions/mean_length": 560.203125,
"completions/mean_terminated_length": 560.203125,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.0073220268823206425,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.062,
"num_tokens": 19406327.0,
"reward": 1.6657295227050781,
"reward_std": 0.4641486406326294,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.717314600944519,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8518532514572144,
"step": 89
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5211285834465799,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.15087795275590554,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.1141732283464567,
"calib/gap": 0.004715977480100997,
"calib/mean_conf": 0.7173110236220472,
"calib/mu_c": 0.719186274509804,
"calib/mu_w": 0.714470297029703,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1329133858267717,
"calib/std_conf": 0.15895752823943748,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2126.0,
"completions/max_terminated_length": 2126.0,
"completions/mean_length": 525.265625,
"completions/mean_terminated_length": 527.3255004882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.096,
"grad_norm": 0.007360328454524279,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0176,
"num_tokens": 19644115.0,
"reward": 1.7847692966461182,
"reward_std": 0.4164116084575653,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7149360775947571,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8616410493850708,
"step": 90
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5034208640125106,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.1553174603174603,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0873015873015873,
"calib/gap": 0.004256206424708364,
"calib/mean_conf": 0.6987301587301588,
"calib/mu_c": 0.7004697986577182,
"calib/mu_w": 0.6962135922330098,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1313888888888889,
"calib/std_conf": 0.14236095486966704,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1953.0,
"completions/max_terminated_length": 1953.0,
"completions/mean_length": 542.41015625,
"completions/mean_terminated_length": 546.6810913085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.007123507093638182,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0054,
"num_tokens": 19890684.0,
"reward": 1.752280592918396,
"reward_std": 0.5156278014183044,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7117398977279663,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8520700931549072,
"step": 91
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6385819227084,
"calib/avg_num_step_conf": 5.05078125,
"calib/ece": 0.127392578125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.1015625,
"calib/gap": 0.07632178217821772,
"calib/mean_conf": 0.698388671875,
"calib/mu_c": 0.7284999999999999,
"calib/mu_w": 0.6521782178217822,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11015625000000001,
"calib/std_conf": 0.1656455361932101,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 968.0,
"completions/max_terminated_length": 968.0,
"completions/mean_length": 472.9296875,
"completions/mean_terminated_length": 474.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.008139155805110931,
"learning_rate": 3e-06,
"loss": -0.0132,
"num_tokens": 20118474.0,
"reward": 1.8182064294815063,
"reward_std": 0.4615139067173004,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7615140676498413,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.8784990310668945,
"step": 92
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4482542860718308,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.26301581027667986,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.17786561264822134,
"calib/gap": -0.02694550118883754,
"calib/mean_conf": 0.7468102766798418,
"calib/mu_c": 0.7338167938931297,
"calib/mu_w": 0.7607622950819672,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24601976284584984,
"calib/std_conf": 0.15431212777926867,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 551.3984375,
"completions/mean_terminated_length": 553.560791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0992,
"grad_norm": 0.006671508774161339,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0129,
"num_tokens": 20365408.0,
"reward": 1.6373653411865234,
"reward_std": 0.500267744064331,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6528552174568176,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8497312068939209,
"step": 93
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6566200554295792,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.13701960784313733,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.09803921568627451,
"calib/gap": 0.08411564625850343,
"calib/mean_conf": 0.7134901960784313,
"calib/mu_c": 0.7491156462585035,
"calib/mu_w": 0.665,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13701960784313733,
"calib/std_conf": 0.1480195234273095,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2389.0,
"completions/max_terminated_length": 2389.0,
"completions/mean_length": 501.3203125,
"completions/mean_terminated_length": 501.3203125,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.007690585218369961,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0537,
"num_tokens": 20602426.0,
"reward": 1.7664459943771362,
"reward_std": 0.451269268989563,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7532836198806763,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8750002980232239,
"step": 94
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5667896917896917,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.125321568627451,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.10588235294117647,
"calib/gap": 0.03936072261072254,
"calib/mean_conf": 0.7037764705882352,
"calib/mu_c": 0.7190576923076923,
"calib/mu_w": 0.6796969696969698,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10866666666666666,
"calib/std_conf": 0.15970846305646103,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1189.0,
"completions/max_terminated_length": 1189.0,
"completions/mean_length": 489.39453125,
"completions/mean_terminated_length": 491.3137512207031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.007843728177249432,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0273,
"num_tokens": 20833839.0,
"reward": 1.8181705474853516,
"reward_std": 0.46654027700424194,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7442966103553772,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.879948079586029,
"step": 95
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7065189132978031,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.1158254901960784,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.10196078431372549,
"calib/gap": 0.100928181463668,
"calib/mean_conf": 0.7160176470588235,
"calib/mu_c": 0.7548057324840763,
"calib/mu_w": 0.6538775510204083,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.108078431372549,
"calib/std_conf": 0.15414507007931777,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 977.0,
"completions/max_terminated_length": 977.0,
"completions/mean_length": 466.3515625,
"completions/mean_terminated_length": 468.180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.1024,
"grad_norm": 0.008034911006689072,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0085,
"num_tokens": 21059041.0,
"reward": 1.829664707183838,
"reward_std": 0.4343709945678711,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7742824554443359,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8725016117095947,
"step": 96
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.46652902902902904,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.18572549019607837,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0784313725490196,
"calib/gap": -0.021219969969969887,
"calib/mean_conf": 0.7090980392156863,
"calib/mu_c": 0.6998611111111112,
"calib/mu_w": 0.721081081081081,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1650588235294117,
"calib/std_conf": 0.14351187078070876,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 488.76953125,
"completions/mean_terminated_length": 488.76953125,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.0073645696975290775,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0151,
"num_tokens": 21289238.0,
"reward": 1.7355914115905762,
"reward_std": 0.4348849058151245,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6995663642883301,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8756111860275269,
"step": 97
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.574774416135881,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.11916996047430833,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.09486166007905138,
"calib/gap": 0.0338010881104035,
"calib/mean_conf": 0.6954545454545455,
"calib/mu_c": 0.7082802547770701,
"calib/mu_w": 0.6744791666666666,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0970355731225297,
"calib/std_conf": 0.15102904894616762,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2311.0,
"completions/max_terminated_length": 2311.0,
"completions/mean_length": 496.5703125,
"completions/mean_terminated_length": 496.5703125,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.007983372546732426,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0391,
"num_tokens": 21522544.0,
"reward": 1.8186938762664795,
"reward_std": 0.41583630442619324,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7432183623313904,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8753073215484619,
"step": 98
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5388136942675158,
"calib/avg_num_step_conf": 5.1015625,
"calib/ece": 0.3299209486166008,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.11857707509881422,
"calib/gap": 0.02430135350318463,
"calib/mean_conf": 0.6964822134387351,
"calib/mu_c": 0.7115625,
"calib/mu_w": 0.6872611464968154,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3234782608695652,
"calib/std_conf": 0.16562934439031005,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 557.22265625,
"completions/mean_terminated_length": 559.4078979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.1056,
"grad_norm": 0.006918633822351694,
"learning_rate": 2.805555555555556e-06,
"loss": 0.0349,
"num_tokens": 21770993.0,
"reward": 1.4331088066101074,
"reward_std": 0.43188124895095825,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.6404386758804321,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8654338121414185,
"step": 99
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.63125,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.1452734375000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.05859375,
"calib/gap": 0.06177586206896557,
"calib/mean_conf": 0.6855078125,
"calib/mu_c": 0.7135000000000001,
"calib/mu_w": 0.6517241379310346,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1419531250000001,
"calib/std_conf": 0.1442292823127982,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1305.0,
"completions/max_terminated_length": 1305.0,
"completions/mean_length": 484.73828125,
"completions/mean_terminated_length": 486.6392517089844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.0076471734791994095,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0295,
"num_tokens": 22002494.0,
"reward": 1.7281349897384644,
"reward_std": 0.4057810306549072,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7427926063537598,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.8884974718093872,
"step": 100
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6494941086065573,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.18779000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.076,
"calib/gap": 0.07791880122950812,
"calib/mean_conf": 0.6602899999999999,
"calib/mu_c": 0.7001844262295082,
"calib/mu_w": 0.622265625,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1800400000000001,
"calib/std_conf": 0.16406870786350453,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 545.35546875,
"completions/mean_terminated_length": 545.35546875,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.006997666321694851,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0487,
"num_tokens": 22249097.0,
"reward": 1.6012694835662842,
"reward_std": 0.49606239795684814,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7136021256446838,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.8633507490158081,
"step": 101
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5767975988516247,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.07235294117647073,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.047058823529411764,
"calib/gap": 0.03954391230588539,
"calib/mean_conf": 0.6566666666666666,
"calib/mu_c": 0.6717088607594937,
"calib/mu_w": 0.6321649484536083,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05470588235294127,
"calib/std_conf": 0.15315334704934885,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2341.0,
"completions/max_terminated_length": 2341.0,
"completions/mean_length": 434.3828125,
"completions/mean_terminated_length": 434.3828125,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.1088,
"grad_norm": 0.007858687080442905,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0249,
"num_tokens": 22466995.0,
"reward": 1.8360176086425781,
"reward_std": 0.39576029777526855,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7551559209823608,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8936017155647278,
"step": 102
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6310987322320399,
"calib/avg_num_step_conf": 4.62109375,
"calib/ece": 0.1661354581673307,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.06374501992031872,
"calib/gap": 0.06998271225509023,
"calib/mean_conf": 0.6900398406374502,
"calib/mu_c": 0.7218248175182482,
"calib/mu_w": 0.651842105263158,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15517928286852592,
"calib/std_conf": 0.15271904263203304,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2632.0,
"completions/max_terminated_length": 2632.0,
"completions/mean_length": 533.00390625,
"completions/mean_terminated_length": 535.0941772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.006842796690762043,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0317,
"num_tokens": 22707996.0,
"reward": 1.700600028038025,
"reward_std": 0.4354501962661743,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7264589667320251,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8650033473968506,
"step": 103
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.626170238975117,
"calib/avg_num_step_conf": 4.49609375,
"calib/ece": 0.18858823529411767,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.023529411764705882,
"calib/gap": 0.04906873614190699,
"calib/mean_conf": 0.6709411764705883,
"calib/mu_c": 0.6963414634146342,
"calib/mu_w": 0.6472727272727272,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18858823529411767,
"calib/std_conf": 0.12849008686332702,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1251.0,
"completions/max_terminated_length": 1251.0,
"completions/mean_length": 468.984375,
"completions/mean_terminated_length": 470.82354736328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.007150310557335615,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0356,
"num_tokens": 22934736.0,
"reward": 1.622341275215149,
"reward_std": 0.35085827112197876,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.719916820526123,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8944482803344727,
"step": 104
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5058509612293449,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.15303149606299216,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.07086614173228346,
"calib/gap": 0.0013418633061145213,
"calib/mean_conf": 0.6801181102362205,
"calib/mu_c": 0.6806622516556291,
"calib/mu_w": 0.6793203883495146,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11933070866141737,
"calib/std_conf": 0.15448793313085318,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2178.0,
"completions/max_terminated_length": 2178.0,
"completions/mean_length": 506.734375,
"completions/mean_terminated_length": 508.7215881347656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.112,
"grad_norm": 0.007065868470817804,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0065,
"num_tokens": 23170220.0,
"reward": 1.7828137874603271,
"reward_std": 0.4391106963157654,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7226855158805847,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8851315975189209,
"step": 105
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5881953867028494,
"calib/avg_num_step_conf": 4.48828125,
"calib/ece": 0.16862745098039222,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.03529411764705882,
"calib/gap": 0.04682681633156538,
"calib/mean_conf": 0.6803921568627451,
"calib/mu_c": 0.7026119402985075,
"calib/mu_w": 0.6557851239669421,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.161764705882353,
"calib/std_conf": 0.13174783972353601,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1451.0,
"completions/max_terminated_length": 1451.0,
"completions/mean_length": 461.94921875,
"completions/mean_terminated_length": 463.76080322265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.007637795992195606,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0118,
"num_tokens": 23393063.0,
"reward": 1.6857733726501465,
"reward_std": 0.3476555049419403,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7268586158752441,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8912352919578552,
"step": 106
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.4945313002637844,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.16032421875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.06640625,
"calib/gap": -0.008823972206137842,
"calib/mean_conf": 0.6870429687499999,
"calib/mu_c": 0.6836305732484077,
"calib/mu_w": 0.6924545454545455,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11704296874999999,
"calib/std_conf": 0.14601597899694582,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1291.0,
"completions/max_terminated_length": 1291.0,
"completions/mean_length": 458.0703125,
"completions/mean_terminated_length": 459.86669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.008233068510890007,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0101,
"num_tokens": 23614945.0,
"reward": 1.8233089447021484,
"reward_std": 0.4427195191383362,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7291610240936279,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8921996355056763,
"step": 107
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5618535655960806,
"calib/avg_num_step_conf": 4.6640625,
"calib/ece": 0.10795686274509793,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.050980392156862744,
"calib/gap": 0.024407185628742556,
"calib/mean_conf": 0.7209843137254901,
"calib/mu_c": 0.7294071856287425,
"calib/mu_w": 0.705,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08701960784313714,
"calib/std_conf": 0.13332304770015738,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1583.0,
"completions/max_terminated_length": 1583.0,
"completions/mean_length": 473.65234375,
"completions/mean_terminated_length": 475.50982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.1152,
"grad_norm": 0.006711322348564863,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0047,
"num_tokens": 23839432.0,
"reward": 1.8879101276397705,
"reward_std": 0.43467026948928833,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7599049806594849,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8854851722717285,
"step": 108
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.536313548710243,
"calib/avg_num_step_conf": 4.8515625,
"calib/ece": 0.20652173913043487,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.08695652173913043,
"calib/gap": 0.019758953168044102,
"calib/mean_conf": 0.7195652173913044,
"calib/mu_c": 0.7290151515151516,
"calib/mu_w": 0.7092561983471075,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20217391304347834,
"calib/std_conf": 0.14194933369207718,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2365.0,
"completions/max_terminated_length": 2365.0,
"completions/mean_length": 503.6015625,
"completions/mean_terminated_length": 505.5765075683594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.007117453962564468,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0267,
"num_tokens": 24072954.0,
"reward": 1.6607688665390015,
"reward_std": 0.3406231999397278,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6928331851959229,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8799298405647278,
"step": 109
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4970926636149944,
"calib/avg_num_step_conf": 4.3828125,
"calib/ece": 0.19239215686274508,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.047058823529411764,
"calib/gap": -0.001516763577879332,
"calib/mean_conf": 0.7174901960784315,
"calib/mu_c": 0.7167883211678833,
"calib/mu_w": 0.7183050847457626,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1863137254901961,
"calib/std_conf": 0.1232245475527072,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1399.0,
"completions/max_terminated_length": 1399.0,
"completions/mean_length": 449.171875,
"completions/mean_terminated_length": 450.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.007710859179496765,
"learning_rate": 2.5e-06,
"loss": -0.0073,
"num_tokens": 24292862.0,
"reward": 1.7000811100006104,
"reward_std": 0.5412262678146362,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7002187371253967,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.896980881690979,
"step": 110
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5235912893700787,
"calib/avg_num_step_conf": 4.56640625,
"calib/ece": 0.21290196078431373,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.03137254901960784,
"calib/gap": 0.014279035433070741,
"calib/mean_conf": 0.7105490196078431,
"calib/mu_c": 0.7177165354330707,
"calib/mu_w": 0.7034374999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2127058823529412,
"calib/std_conf": 0.12745353843586749,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2903.0,
"completions/max_terminated_length": 2903.0,
"completions/mean_length": 495.90625,
"completions/mean_terminated_length": 495.90625,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.1184,
"grad_norm": 0.007386922836303711,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0464,
"num_tokens": 24527222.0,
"reward": 1.637666940689087,
"reward_std": 0.4339905083179474,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6930207014083862,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8888974189758301,
"step": 111
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6325248756218905,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.16582677165354331,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.051181102362204724,
"calib/gap": 0.06633333333333347,
"calib/mean_conf": 0.6886614173228346,
"calib/mu_c": 0.7200000000000001,
"calib/mu_w": 0.6536666666666666,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16346456692913386,
"calib/std_conf": 0.14771947346943018,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1087.0,
"completions/max_terminated_length": 1087.0,
"completions/mean_length": 472.19140625,
"completions/mean_terminated_length": 475.9094543457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.007241616956889629,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0253,
"num_tokens": 24756023.0,
"reward": 1.6869521141052246,
"reward_std": 0.32682478427886963,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7303000688552856,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8925088047981262,
"step": 112
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6635638297872339,
"calib/avg_num_step_conf": 4.36328125,
"calib/ece": 0.15328063241106726,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.03162055335968379,
"calib/gap": 0.05764057750759888,
"calib/mean_conf": 0.7098023715415019,
"calib/mu_c": 0.7353191489361702,
"calib/mu_w": 0.6776785714285714,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15288537549407122,
"calib/std_conf": 0.10540511193881581,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2571.0,
"completions/max_terminated_length": 2571.0,
"completions/mean_length": 432.546875,
"completions/mean_terminated_length": 434.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.007901332341134548,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0609,
"num_tokens": 24971955.0,
"reward": 1.7260768413543701,
"reward_std": 0.5581738352775574,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7347862720489502,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8960837721824646,
"step": 113
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.52540299818102,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.15988281249999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0390625,
"calib/gap": 0.022055447531832084,
"calib/mean_conf": 0.7181640624999999,
"calib/mu_c": 0.7273825503355704,
"calib/mu_w": 0.7053271028037383,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1480078125,
"calib/std_conf": 0.1211893712686723,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1642.0,
"completions/max_terminated_length": 1642.0,
"completions/mean_length": 451.1015625,
"completions/mean_terminated_length": 452.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1216,
"grad_norm": 0.00733104208484292,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0409,
"num_tokens": 25192461.0,
"reward": 1.7813656330108643,
"reward_std": 0.3606231212615967,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7342410087585449,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.899033784866333,
"step": 114
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5454853412170485,
"calib/avg_num_step_conf": 4.5234375,
"calib/ece": 0.2209411764705882,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0392156862745098,
"calib/gap": 0.021579822616408117,
"calib/mean_conf": 0.7146666666666666,
"calib/mu_c": 0.7250757575757576,
"calib/mu_w": 0.7034959349593495,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2089803921568627,
"calib/std_conf": 0.12425695493481724,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2858.0,
"completions/max_terminated_length": 2858.0,
"completions/mean_length": 441.8359375,
"completions/mean_terminated_length": 441.8359375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.008461576886475086,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0394,
"num_tokens": 25410835.0,
"reward": 1.6725574731826782,
"reward_std": 0.4955918788909912,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7040703296661377,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9002221822738647,
"step": 115
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5469261778785588,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.1417254901960785,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.047058823529411764,
"calib/gap": 0.018904006046863375,
"calib/mean_conf": 0.7134901960784313,
"calib/mu_c": 0.7214965986394558,
"calib/mu_w": 0.7025925925925924,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1393725490196079,
"calib/std_conf": 0.12002930566161171,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 496.7265625,
"completions/mean_terminated_length": 496.7265625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.007390998303890228,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0516,
"num_tokens": 25642517.0,
"reward": 1.7620733976364136,
"reward_std": 0.4782768189907074,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7265383005142212,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8920679092407227,
"step": 116
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.453277587890625,
"calib/avg_num_step_conf": 4.71484375,
"calib/ece": 0.23218749999999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0234375,
"calib/gap": -0.016875000000000195,
"calib/mean_conf": 0.71875,
"calib/mu_c": 0.7103124999999999,
"calib/mu_w": 0.7271875000000001,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22546874999999997,
"calib/std_conf": 0.11779218989389748,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1274.0,
"completions/max_terminated_length": 1274.0,
"completions/mean_length": 459.79296875,
"completions/mean_terminated_length": 461.5960998535156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.1248,
"grad_norm": 0.007319636642932892,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0099,
"num_tokens": 25866824.0,
"reward": 1.637864351272583,
"reward_std": 0.47429442405700684,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6729999780654907,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8940824270248413,
"step": 117
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5011826953003423,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.19444881889763782,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.05905511811023622,
"calib/gap": 0.008971677559913016,
"calib/mean_conf": 0.7259448818897637,
"calib/mu_c": 0.7301481481481482,
"calib/mu_w": 0.7211764705882352,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19444881889763782,
"calib/std_conf": 0.13003214666657753,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 466.37890625,
"completions/mean_terminated_length": 468.2078857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.007337419781833887,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0379,
"num_tokens": 26090225.0,
"reward": 1.6856483221054077,
"reward_std": 0.3241842985153198,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.695266842842102,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8988887071609497,
"step": 118
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5068715642178911,
"calib/avg_num_step_conf": 4.6171875,
"calib/ece": 0.15708661417322833,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.023622047244094488,
"calib/gap": 0.00229010494752635,
"calib/mean_conf": 0.6996062992125984,
"calib/mu_c": 0.7006521739130435,
"calib/mu_w": 0.6983620689655171,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15669291338582675,
"calib/std_conf": 0.11311899925218569,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1358.0,
"completions/max_terminated_length": 1358.0,
"completions/mean_length": 512.45703125,
"completions/mean_terminated_length": 514.4666748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.007050854153931141,
"learning_rate": 2.25e-06,
"loss": 0.0115,
"num_tokens": 26326478.0,
"reward": 1.7035629749298096,
"reward_std": 0.5102789402008057,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7076945304870605,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8956202268600464,
"step": 119
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5821460527221419,
"calib/avg_num_step_conf": 4.53515625,
"calib/ece": 0.07598425196850389,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.015748031496062992,
"calib/gap": 0.04384885401610572,
"calib/mean_conf": 0.6977952755905512,
"calib/mu_c": 0.712814371257485,
"calib/mu_w": 0.6689655172413793,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05814960629921259,
"calib/std_conf": 0.12801275395260564,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1062.0,
"completions/max_terminated_length": 1062.0,
"completions/mean_length": 448.9453125,
"completions/mean_terminated_length": 452.4803161621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.128,
"grad_norm": 0.007677591405808926,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0064,
"num_tokens": 26548096.0,
"reward": 1.890402913093567,
"reward_std": 0.40941351652145386,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.767189085483551,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9037976861000061,
"step": 120
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4778687515390298,
"calib/avg_num_step_conf": 4.74609375,
"calib/ece": 0.23031372549019608,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.03137254901960784,
"calib/gap": -0.007226052696380103,
"calib/mean_conf": 0.6958039215686275,
"calib/mu_c": 0.6922900763358778,
"calib/mu_w": 0.6995161290322579,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20619607843137255,
"calib/std_conf": 0.12692764954853383,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2383.0,
"completions/max_terminated_length": 2383.0,
"completions/mean_length": 502.6171875,
"completions/mean_terminated_length": 502.6171875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.0072189816273748875,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0204,
"num_tokens": 26781822.0,
"reward": 1.6658300161361694,
"reward_std": 0.4628363847732544,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6945909857749939,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9062290191650391,
"step": 121
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6797278121775026,
"calib/avg_num_step_conf": 4.54296875,
"calib/ece": 0.0907086614173228,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.01968503937007874,
"calib/gap": 0.0847265221878224,
"calib/mean_conf": 0.6891338582677166,
"calib/mu_c": 0.723157894736842,
"calib/mu_w": 0.6384313725490196,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0907086614173228,
"calib/std_conf": 0.12751423340066567,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2624.0,
"completions/max_terminated_length": 2624.0,
"completions/mean_length": 472.09375,
"completions/mean_terminated_length": 473.94512939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.007661870680749416,
"learning_rate": 2.166666666666667e-06,
"loss": 0.0113,
"num_tokens": 27010022.0,
"reward": 1.8057467937469482,
"reward_std": 0.43043383955955505,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7698593735694885,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9062525033950806,
"step": 122
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5482461538461539,
"calib/avg_num_step_conf": 4.3515625,
"calib/ece": 0.20866666666666667,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0196078431372549,
"calib/gap": 0.01661538461538481,
"calib/mean_conf": 0.7184705882352941,
"calib/mu_c": 0.7266153846153847,
"calib/mu_w": 0.7099999999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.20866666666666667,
"calib/std_conf": 0.10755579545030615,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1879.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 501.75390625,
"completions/mean_terminated_length": 501.75390625,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.1312,
"grad_norm": 0.0073029338382184505,
"learning_rate": 2.138888888888889e-06,
"loss": 0.0064,
"num_tokens": 27243759.0,
"reward": 1.645280122756958,
"reward_std": 0.46035170555114746,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6940886974334717,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.8870315551757812,
"step": 123
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5546541132478631,
"calib/avg_num_step_conf": 4.45703125,
"calib/ece": 0.13226190476190472,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.011904761904761904,
"calib/gap": 0.01400641025641014,
"calib/mean_conf": 0.7140873015873015,
"calib/mu_c": 0.7194230769230768,
"calib/mu_w": 0.7054166666666667,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11365079365079361,
"calib/std_conf": 0.10809846480278325,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1067.0,
"completions/max_terminated_length": 1067.0,
"completions/mean_length": 476.91015625,
"completions/mean_terminated_length": 478.7804260253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.007507139816880226,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0072,
"num_tokens": 27472664.0,
"reward": 1.8107237815856934,
"reward_std": 0.401524156332016,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7354112863540649,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8902963399887085,
"step": 124
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4849450274862569,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.22370078740157487,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.03937007874015748,
"calib/gap": -0.007656171914042886,
"calib/mean_conf": 0.7120472440944883,
"calib/mu_c": 0.7085507246376812,
"calib/mu_w": 0.7162068965517241,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19622047244094498,
"calib/std_conf": 0.11846683699999311,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2106.0,
"completions/max_terminated_length": 2106.0,
"completions/mean_length": 504.73046875,
"completions/mean_terminated_length": 504.73046875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.006698730401694775,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0289,
"num_tokens": 27706683.0,
"reward": 1.7050342559814453,
"reward_std": 0.5200086832046509,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.700056254863739,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.901330828666687,
"step": 125
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6516716754320061,
"calib/avg_num_step_conf": 4.4921875,
"calib/ece": 0.19229249011857708,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.05138339920948617,
"calib/gap": 0.06253443526170799,
"calib/mean_conf": 0.7140316205533596,
"calib/mu_c": 0.743939393939394,
"calib/mu_w": 0.681404958677686,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19229249011857708,
"calib/std_conf": 0.12270804302194291,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2901.0,
"completions/max_terminated_length": 2901.0,
"completions/mean_length": 479.03515625,
"completions/mean_terminated_length": 480.91375732421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.1344,
"grad_norm": 0.006964969914406538,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0102,
"num_tokens": 27934780.0,
"reward": 1.6716426610946655,
"reward_std": 0.4074101448059082,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7178597450256348,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8905860185623169,
"step": 126
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5591517857142858,
"calib/avg_num_step_conf": 4.37109375,
"calib/ece": 0.18771653543307087,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": 0.020229414682539915,
"calib/mean_conf": 0.675511811023622,
"calib/mu_c": 0.685546875,
"calib/mu_w": 0.6653174603174601,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17964566929133857,
"calib/std_conf": 0.12615893401639564,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2581.0,
"completions/max_terminated_length": 2581.0,
"completions/mean_length": 468.953125,
"completions/mean_terminated_length": 468.953125,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.00796822365373373,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0523,
"num_tokens": 28158504.0,
"reward": 1.647589087486267,
"reward_std": 0.4065118432044983,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7053730487823486,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.908420741558075,
"step": 127
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.603496062992126,
"calib/avg_num_step_conf": 4.27734375,
"calib/ece": 0.14055555555555554,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04899653543307103,
"calib/mean_conf": 0.6365873015873017,
"calib/mu_c": 0.66128,
"calib/mu_w": 0.612283464566929,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14055555555555554,
"calib/std_conf": 0.13243844151578976,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2386.0,
"completions/max_terminated_length": 2386.0,
"completions/mean_length": 498.8359375,
"completions/mean_terminated_length": 502.7637634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.007659838069230318,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.017,
"num_tokens": 28392870.0,
"reward": 1.639002799987793,
"reward_std": 0.5024175643920898,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7256976366043091,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9084383845329285,
"step": 128
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5661844025636146,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.100234375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0390625,
"calib/gap": 0.03288533536391913,
"calib/mean_conf": 0.690625,
"calib/mu_c": 0.7038562091503268,
"calib/mu_w": 0.6709708737864076,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09660156250000002,
"calib/std_conf": 0.13231561463032246,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 999.0,
"completions/max_terminated_length": 999.0,
"completions/mean_length": 441.421875,
"completions/mean_terminated_length": 443.1529541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.1376,
"grad_norm": 0.007797915954142809,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0022,
"num_tokens": 28608258.0,
"reward": 1.8130768537521362,
"reward_std": 0.4100300967693329,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7492015361785889,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.917168378829956,
"step": 129
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5470162748643761,
"calib/avg_num_step_conf": 4.3203125,
"calib/ece": 0.13023437499999987,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.01171875,
"calib/gap": 0.03660036166365288,
"calib/mean_conf": 0.6768749999999999,
"calib/mu_c": 0.690886075949367,
"calib/mu_w": 0.6542857142857141,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09496093749999993,
"calib/std_conf": 0.1444669620536128,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 983.0,
"completions/max_terminated_length": 983.0,
"completions/mean_length": 440.83203125,
"completions/mean_terminated_length": 442.5608215332031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.008532690815627575,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0038,
"num_tokens": 28826399.0,
"reward": 1.841002106666565,
"reward_std": 0.30434200167655945,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7529324293136597,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9157634973526001,
"step": 130
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5894105894105894,
"calib/avg_num_step_conf": 4.2890625,
"calib/ece": 0.25133333333333335,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.011764705882352941,
"calib/gap": 0.04025786713286705,
"calib/mean_conf": 0.690549019607843,
"calib/mu_c": 0.7131249999999999,
"calib/mu_w": 0.6728671328671328,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25133333333333335,
"calib/std_conf": 0.11641917396918036,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2895.0,
"completions/max_terminated_length": 2895.0,
"completions/mean_length": 454.296875,
"completions/mean_terminated_length": 454.296875,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.007329374551773071,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0185,
"num_tokens": 29048907.0,
"reward": 1.5587986707687378,
"reward_std": 0.3782535791397095,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6940823793411255,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9239246845245361,
"step": 131
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5577631578947367,
"calib/avg_num_step_conf": 4.4765625,
"calib/ece": 0.08654901960784306,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.028427631578947454,
"calib/mean_conf": 0.7067843137254902,
"calib/mu_c": 0.717375,
"calib/mu_w": 0.6889473684210525,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08294117647058816,
"calib/std_conf": 0.11680993674584471,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1294.0,
"completions/max_terminated_length": 1294.0,
"completions/mean_length": 475.59375,
"completions/mean_terminated_length": 477.4588623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.1408,
"grad_norm": 0.0073455991223454475,
"learning_rate": 1.888888888888889e-06,
"loss": 0.025,
"num_tokens": 29276251.0,
"reward": 1.8585619926452637,
"reward_std": 0.4749313294887543,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7566285133361816,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9119948744773865,
"step": 132
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6084105441733003,
"calib/avg_num_step_conf": 4.28125,
"calib/ece": 0.33207843137254894,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0196078431372549,
"calib/gap": 0.04260015659663319,
"calib/mean_conf": 0.7124705882352942,
"calib/mu_c": 0.7388659793814433,
"calib/mu_w": 0.6962658227848101,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33207843137254894,
"calib/std_conf": 0.11152672289304398,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1927.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 513.99609375,
"completions/mean_terminated_length": 513.99609375,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.007127861492335796,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.0471,
"num_tokens": 29514178.0,
"reward": 1.45475435256958,
"reward_std": 0.47963637113571167,
"rewards/accuracy_reward_step": 0.37890625,
"rewards/final_brier_reward_step": 0.6553382873535156,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9058670401573181,
"step": 133
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5925682444277486,
"calib/avg_num_step_conf": 4.0078125,
"calib/ece": 0.2086166007905139,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.015810276679841896,
"calib/gap": 0.03940771349862249,
"calib/mean_conf": 0.6868774703557312,
"calib/mu_c": 0.7074380165289256,
"calib/mu_w": 0.6680303030303031,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2086166007905139,
"calib/std_conf": 0.13051181100794376,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2451.0,
"completions/max_terminated_length": 2451.0,
"completions/mean_length": 543.16015625,
"completions/mean_terminated_length": 543.16015625,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.006815467029809952,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0443,
"num_tokens": 29762179.0,
"reward": 1.5914236307144165,
"reward_std": 0.5106499195098877,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6931988596916199,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.8912457227706909,
"step": 134
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5301959325396826,
"calib/avg_num_step_conf": 4.05078125,
"calib/ece": 0.14402343750000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.015625,
"calib/gap": 0.011081349206349178,
"calib/mean_conf": 0.6891796875,
"calib/mu_c": 0.6940277777777778,
"calib/mu_w": 0.6829464285714286,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13535156250000002,
"calib/std_conf": 0.11849195209972002,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1428.0,
"completions/max_terminated_length": 1428.0,
"completions/mean_length": 493.9609375,
"completions/mean_terminated_length": 495.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.144,
"grad_norm": 0.007231003139168024,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0076,
"num_tokens": 29994513.0,
"reward": 1.7483104467391968,
"reward_std": 0.5153388977050781,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7237054705619812,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9101613163948059,
"step": 135
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5537567968363816,
"calib/avg_num_step_conf": 4.2578125,
"calib/ece": 0.24686274509803918,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.047058823529411764,
"calib/gap": 0.02223739495798316,
"calib/mean_conf": 0.7066274509803921,
"calib/mu_c": 0.7184873949579832,
"calib/mu_w": 0.69625,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24341176470588233,
"calib/std_conf": 0.1304366664413271,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 995.0,
"completions/max_terminated_length": 995.0,
"completions/mean_length": 454.2578125,
"completions/mean_terminated_length": 456.03924560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.007509256713092327,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0082,
"num_tokens": 30219291.0,
"reward": 1.5949571132659912,
"reward_std": 0.3979659080505371,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.684899628162384,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9136791229248047,
"step": 136
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6786153846153846,
"calib/avg_num_step_conf": 4.296875,
"calib/ece": 0.21956862745098038,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0196078431372549,
"calib/gap": 0.07330769230769252,
"calib/mean_conf": 0.7293725490196079,
"calib/mu_c": 0.7653076923076924,
"calib/mu_w": 0.6919999999999998,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.21956862745098038,
"calib/std_conf": 0.1153426201833364,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1272.0,
"completions/max_terminated_length": 1272.0,
"completions/mean_length": 454.46484375,
"completions/mean_terminated_length": 456.2471008300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.007360511925071478,
"learning_rate": 1.75e-06,
"loss": -0.0025,
"num_tokens": 30442618.0,
"reward": 1.6539958715438843,
"reward_std": 0.3815937042236328,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7103238105773926,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8978472948074341,
"step": 137
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6042407660738713,
"calib/avg_num_step_conf": 4.32421875,
"calib/ece": 0.08468749999999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0234375,
"calib/gap": 0.042098495212038345,
"calib/mean_conf": 0.7213281250000001,
"calib/mu_c": 0.7354705882352941,
"calib/mu_w": 0.6933720930232558,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07097656249999998,
"calib/std_conf": 0.12055452017234514,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1149.0,
"completions/max_terminated_length": 1149.0,
"completions/mean_length": 463.515625,
"completions/mean_terminated_length": 465.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.1472,
"grad_norm": 0.007477740757167339,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0101,
"num_tokens": 30665614.0,
"reward": 1.9196381568908691,
"reward_std": 0.473527193069458,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7778867483139038,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.9162907004356384,
"step": 138
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5980053942168978,
"calib/avg_num_step_conf": 4.10546875,
"calib/ece": 0.14054687499999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.01953125,
"calib/gap": 0.04040895690898805,
"calib/mean_conf": 0.7196874999999999,
"calib/mu_c": 0.7365771812080536,
"calib/mu_w": 0.6961682242990656,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13910156249999994,
"calib/std_conf": 0.11381136737492437,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1039.0,
"completions/max_terminated_length": 1039.0,
"completions/mean_length": 430.80859375,
"completions/mean_terminated_length": 432.4980773925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.007508372887969017,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0199,
"num_tokens": 30878997.0,
"reward": 1.7896010875701904,
"reward_std": 0.43053174018859863,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7444875240325928,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.9217291474342346,
"step": 139
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6574622188174664,
"calib/avg_num_step_conf": 3.95703125,
"calib/ece": 0.06691406249999986,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.015625,
"calib/gap": 0.06244097778396818,
"calib/mean_conf": 0.7247265625,
"calib/mu_c": 0.7449710982658959,
"calib/mu_w": 0.6825301204819277,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05792968749999987,
"calib/std_conf": 0.12489653310614186,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1250.0,
"completions/max_terminated_length": 1250.0,
"completions/mean_length": 458.50390625,
"completions/mean_terminated_length": 460.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.007245223503559828,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.016,
"num_tokens": 31101390.0,
"reward": 1.9366732835769653,
"reward_std": 0.3448790907859802,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7867851257324219,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9130330085754395,
"step": 140
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7461554164398476,
"calib/avg_num_step_conf": 4.16015625,
"calib/ece": 0.07121568627450975,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.050980392156862744,
"calib/gap": 0.12020073489384864,
"calib/mean_conf": 0.7213333333333333,
"calib/mu_c": 0.762814371257485,
"calib/mu_w": 0.6426136363636363,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06882352941176464,
"calib/std_conf": 0.1383763930148161,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1049.0,
"completions/max_terminated_length": 1049.0,
"completions/mean_length": 484.69140625,
"completions/mean_terminated_length": 486.5921936035156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.1504,
"grad_norm": 0.007165694609284401,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0072,
"num_tokens": 31332567.0,
"reward": 1.901097297668457,
"reward_std": 0.362898051738739,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7994171977043152,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.906534731388092,
"step": 141
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6126891590176136,
"calib/avg_num_step_conf": 4.09765625,
"calib/ece": 0.19619607843137254,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.047058823529411764,
"calib/gap": 0.052981890349789174,
"calib/mean_conf": 0.741294117647059,
"calib/mu_c": 0.7653956834532374,
"calib/mu_w": 0.7124137931034482,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19619607843137254,
"calib/std_conf": 0.11975912002074422,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1221.0,
"completions/max_terminated_length": 1221.0,
"completions/mean_length": 465.8828125,
"completions/mean_terminated_length": 467.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.007731846533715725,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0167,
"num_tokens": 31556993.0,
"reward": 1.7208367586135864,
"reward_std": 0.3604811429977417,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7226402759552002,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.910706639289856,
"step": 142
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5276494780132869,
"calib/avg_num_step_conf": 4.07421875,
"calib/ece": 0.17389763779527553,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.027559055118110236,
"calib/gap": 0.012524517557734871,
"calib/mean_conf": 0.7353149606299212,
"calib/mu_c": 0.7406896551724137,
"calib/mu_w": 0.7281651376146788,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16917322834645662,
"calib/std_conf": 0.11812719694494285,
"calib/step_conf_rate": 0.97265625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2122.0,
"completions/max_terminated_length": 2122.0,
"completions/mean_length": 498.69921875,
"completions/mean_terminated_length": 498.69921875,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.007055537775158882,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0326,
"num_tokens": 31791996.0,
"reward": 1.7306500673294067,
"reward_std": 0.4198153614997864,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7007426023483276,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8859198093414307,
"step": 143
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6412354620795321,
"calib/avg_num_step_conf": 4.11328125,
"calib/ece": 0.06804687499999994,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.03125,
"calib/gap": 0.06610348910091224,
"calib/mean_conf": 0.743828125,
"calib/mu_c": 0.7652601156069363,
"calib/mu_w": 0.6991566265060241,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06804687499999994,
"calib/std_conf": 0.12284467513483999,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1096.0,
"completions/max_terminated_length": 1096.0,
"completions/mean_length": 446.578125,
"completions/mean_terminated_length": 448.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.1536,
"grad_norm": 0.008307057432830334,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0113,
"num_tokens": 32010448.0,
"reward": 1.9410185813903809,
"reward_std": 0.3637969493865967,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7901445031166077,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.9192425012588501,
"step": 144
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5532927163198248,
"calib/avg_num_step_conf": 4.15625,
"calib/ece": 0.1580314960629921,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.031496062992125984,
"calib/gap": 0.010372398685651651,
"calib/mean_conf": 0.7649606299212598,
"calib/mu_c": 0.7685542168674698,
"calib/mu_w": 0.7581818181818182,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1347244094488189,
"calib/std_conf": 0.12377509458913,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1199.0,
"completions/max_terminated_length": 1199.0,
"completions/mean_length": 447.35546875,
"completions/mean_terminated_length": 449.1098327636719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.007844759151339531,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0049,
"num_tokens": 32227675.0,
"reward": 1.8811393976211548,
"reward_std": 0.4249289035797119,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7446749806404114,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9048829078674316,
"step": 145
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6414623494915466,
"calib/avg_num_step_conf": 3.875,
"calib/ece": 0.2798425196850393,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.047244094488188976,
"calib/gap": 0.0767708528292469,
"calib/mean_conf": 0.7404724409448818,
"calib/mu_c": 0.7818803418803417,
"calib/mu_w": 0.7051094890510948,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2798425196850393,
"calib/std_conf": 0.14709715182539784,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1064.0,
"completions/max_terminated_length": 1064.0,
"completions/mean_length": 485.48046875,
"completions/mean_terminated_length": 487.38433837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.008200140669941902,
"learning_rate": 1.5e-06,
"loss": 0.012,
"num_tokens": 32459174.0,
"reward": 1.5805511474609375,
"reward_std": 0.40894848108291626,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6843593120574951,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9112821817398071,
"step": 146
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6325720265944348,
"calib/avg_num_step_conf": 4.09765625,
"calib/ece": 0.2608627450980392,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.050980392156862744,
"calib/gap": 0.06026409751292794,
"calib/mean_conf": 0.7745882352941176,
"calib/mu_c": 0.8038931297709923,
"calib/mu_w": 0.7436290322580643,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2608627450980392,
"calib/std_conf": 0.11827405471384854,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2334.0,
"completions/max_terminated_length": 2334.0,
"completions/mean_length": 467.71875,
"completions/mean_terminated_length": 467.71875,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.1568,
"grad_norm": 0.008043180219829082,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.034,
"num_tokens": 32682590.0,
"reward": 1.6648333072662354,
"reward_std": 0.3625674545764923,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6928074359893799,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9118380546569824,
"step": 147
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6541204694287751,
"calib/avg_num_step_conf": 4.1015625,
"calib/ece": 0.15777343749999992,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.03515625,
"calib/gap": 0.07320171172923517,
"calib/mean_conf": 0.7788671875,
"calib/mu_c": 0.8066037735849053,
"calib/mu_w": 0.7334020618556701,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15777343749999992,
"calib/std_conf": 0.12352206730313349,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1144.0,
"completions/max_terminated_length": 1144.0,
"completions/mean_length": 451.91015625,
"completions/mean_terminated_length": 453.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.008227191865444183,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0054,
"num_tokens": 32903391.0,
"reward": 1.8450043201446533,
"reward_std": 0.4270305633544922,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.755149245262146,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9061184525489807,
"step": 148
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.507754846779237,
"calib/avg_num_step_conf": 4.0234375,
"calib/ece": 0.3051778656126481,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.08300395256916997,
"calib/gap": 0.011647904940587761,
"calib/mean_conf": 0.7860474308300396,
"calib/mu_c": 0.7920325203252031,
"calib/mu_w": 0.7803846153846153,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3025296442687746,
"calib/std_conf": 0.13556276248458035,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 502.640625,
"completions/mean_terminated_length": 504.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.007383955176919699,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.059,
"num_tokens": 33136523.0,
"reward": 1.6020516157150269,
"reward_std": 0.48702579736709595,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6401144862174988,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9087169170379639,
"step": 149
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6135663235197693,
"calib/avg_num_step_conf": 4.05078125,
"calib/ece": 0.20349206349206345,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.07936507936507936,
"calib/gap": 0.03701790046554343,
"calib/mean_conf": 0.8015873015873016,
"calib/mu_c": 0.8164238410596026,
"calib/mu_w": 0.7794059405940592,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.20293650793650791,
"calib/std_conf": 0.11184181501022798,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 460.48046875,
"completions/mean_terminated_length": 462.28631591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.16,
"grad_norm": 0.007063120137900114,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0085,
"num_tokens": 33359366.0,
"reward": 1.7778428792953491,
"reward_std": 0.44000566005706787,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7124598026275635,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8989115357398987,
"step": 150
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5533248081841432,
"calib/avg_num_step_conf": 3.765625,
"calib/ece": 0.3199203187250995,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.043824701195219126,
"calib/gap": 0.03450319693094617,
"calib/mean_conf": 0.7780876494023905,
"calib/mu_c": 0.7967826086956522,
"calib/mu_w": 0.762279411764706,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3199203187250995,
"calib/std_conf": 0.13798900111652393,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2223.0,
"completions/max_terminated_length": 2223.0,
"completions/mean_length": 480.609375,
"completions/mean_terminated_length": 482.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.007321125827729702,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0082,
"num_tokens": 33589426.0,
"reward": 1.5496189594268799,
"reward_std": 0.44075965881347656,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6348445415496826,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9073817133903503,
"step": 151
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.560031746031746,
"calib/avg_num_step_conf": 3.734375,
"calib/ece": 0.2831872509960158,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.07569721115537849,
"calib/gap": 0.03167873015873013,
"calib/mean_conf": 0.7843824701195219,
"calib/mu_c": 0.80015873015873,
"calib/mu_w": 0.7684799999999998,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2827888446215139,
"calib/std_conf": 0.1388538640055688,
"calib/step_conf_rate": 0.97265625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1180.0,
"completions/max_terminated_length": 1180.0,
"completions/mean_length": 469.15234375,
"completions/mean_terminated_length": 472.8464660644531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.007486861664801836,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.002,
"num_tokens": 33814921.0,
"reward": 1.5980477333068848,
"reward_std": 0.5343087911605835,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6394152045249939,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.8855881690979004,
"step": 152
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5712327893931668,
"calib/avg_num_step_conf": 3.8125,
"calib/ece": 0.22196850393700773,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0984251968503937,
"calib/gap": 0.03161142274349815,
"calib/mean_conf": 0.8046456692913385,
"calib/mu_c": 0.8178378378378378,
"calib/mu_w": 0.7862264150943397,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22196850393700773,
"calib/std_conf": 0.12096264325783446,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1086.0,
"completions/max_terminated_length": 1086.0,
"completions/mean_length": 467.07421875,
"completions/mean_terminated_length": 468.9059143066406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.1632,
"grad_norm": 0.007297352887690067,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0178,
"num_tokens": 34041812.0,
"reward": 1.7645387649536133,
"reward_std": 0.4808223247528076,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6989551186561584,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9138876795768738,
"step": 153
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5651244890375325,
"calib/avg_num_step_conf": 3.62890625,
"calib/ece": 0.32023529411764695,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.07058823529411765,
"calib/gap": 0.0416963953920475,
"calib/mean_conf": 0.7790588235294119,
"calib/mu_c": 0.8016239316239315,
"calib/mu_w": 0.759927536231884,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32023529411764695,
"calib/std_conf": 0.14244605607995456,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1115.0,
"completions/max_terminated_length": 1115.0,
"completions/mean_length": 441.66015625,
"completions/mean_terminated_length": 443.3921813964844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.008471459150314331,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0134,
"num_tokens": 34259317.0,
"reward": 1.5774880647659302,
"reward_std": 0.4608520269393921,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6470234394073486,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9285538196563721,
"step": 154
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5584041168082337,
"calib/avg_num_step_conf": 3.90625,
"calib/ece": 0.26677165354330695,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.05511811023622047,
"calib/gap": 0.026614173228346027,
"calib/mean_conf": 0.7585039370078739,
"calib/mu_c": 0.7718110236220469,
"calib/mu_w": 0.7451968503937009,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2626377952755904,
"calib/std_conf": 0.1507152435229073,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 967.0,
"completions/max_terminated_length": 967.0,
"completions/mean_length": 443.65625,
"completions/mean_terminated_length": 445.3961181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.0080947894603014,
"learning_rate": 1.25e-06,
"loss": 0.0004,
"num_tokens": 34480109.0,
"reward": 1.6442118883132935,
"reward_std": 0.44987744092941284,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6685038805007935,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9239685535430908,
"step": 155
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5989190548014077,
"calib/avg_num_step_conf": 4.12109375,
"calib/ece": 0.31901185770750984,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.09881422924901186,
"calib/gap": 0.07836915535444933,
"calib/mean_conf": 0.7814624505928853,
"calib/mu_c": 0.8235897435897434,
"calib/mu_w": 0.745220588235294,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.31901185770750984,
"calib/std_conf": 0.16042629589979623,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 470.83203125,
"completions/mean_terminated_length": 472.678466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1664,
"grad_norm": 0.007263018749654293,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0302,
"num_tokens": 34705402.0,
"reward": 1.5716431140899658,
"reward_std": 0.4974963068962097,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.655100405216217,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9127222299575806,
"step": 156
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6819584646048994,
"calib/avg_num_step_conf": 3.99609375,
"calib/ece": 0.18440944881889754,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.13385826771653545,
"calib/gap": 0.11536809618723087,
"calib/mean_conf": 0.7773228346456693,
"calib/mu_c": 0.8241059602649007,
"calib/mu_w": 0.7087378640776698,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18362204724409437,
"calib/std_conf": 0.1615717525152909,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 960.0,
"completions/max_terminated_length": 960.0,
"completions/mean_length": 445.63671875,
"completions/mean_terminated_length": 447.38433837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.007417924702167511,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.007,
"num_tokens": 34923213.0,
"reward": 1.7946419715881348,
"reward_std": 0.45173823833465576,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7452230453491211,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9177197217941284,
"step": 157
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5686170212765956,
"calib/avg_num_step_conf": 3.91796875,
"calib/ece": 0.20771653543307084,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2204724409448819,
"calib/gap": 0.05993617021276609,
"calib/mean_conf": 0.8288188976377951,
"calib/mu_c": 0.851,
"calib/mu_w": 0.7910638297872339,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20330708661417318,
"calib/std_conf": 0.13153014787629336,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 443.3828125,
"completions/mean_terminated_length": 445.12158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.007617747876793146,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0454,
"num_tokens": 35141959.0,
"reward": 1.8422807455062866,
"reward_std": 0.536354660987854,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7310367226600647,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9193363189697266,
"step": 158
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5075898967276347,
"calib/avg_num_step_conf": 3.84375,
"calib/ece": 0.22454901960784301,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.10196078431372549,
"calib/gap": 0.01061776782381485,
"calib/mean_conf": 0.7556078431372547,
"calib/mu_c": 0.760354609929078,
"calib/mu_w": 0.7497368421052631,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21360784313725475,
"calib/std_conf": 0.1636313501808703,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2553.0,
"completions/max_terminated_length": 2553.0,
"completions/mean_length": 442.62109375,
"completions/mean_terminated_length": 442.62109375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.1696,
"grad_norm": 0.007485765963792801,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0209,
"num_tokens": 35360054.0,
"reward": 1.729485273361206,
"reward_std": 0.40719637274742126,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6875070333480835,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9335594773292542,
"step": 159
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6138448969331322,
"calib/avg_num_step_conf": 3.7265625,
"calib/ece": 0.25837944664031615,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.22924901185770752,
"calib/gap": 0.07113122171945707,
"calib/mean_conf": 0.7959288537549407,
"calib/mu_c": 0.8288235294117647,
"calib/mu_w": 0.7576923076923077,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25837944664031615,
"calib/std_conf": 0.1656500754825719,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2996.0,
"completions/max_terminated_length": 2996.0,
"completions/mean_length": 452.23046875,
"completions/mean_terminated_length": 454.0039367675781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.008462823927402496,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0429,
"num_tokens": 35580665.0,
"reward": 1.6944373846054077,
"reward_std": 0.4696410298347473,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6844590306282043,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9292279481887817,
"step": 160
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6363798898662661,
"calib/avg_num_step_conf": 3.90625,
"calib/ece": 0.11937500000000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.1875,
"calib/gap": 0.08837874561968084,
"calib/mean_conf": 0.76515625,
"calib/mu_c": 0.7924293785310734,
"calib/mu_w": 0.7040506329113926,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09656250000000004,
"calib/std_conf": 0.17892467154067238,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1355.0,
"completions/max_terminated_length": 1355.0,
"completions/mean_length": 437.55078125,
"completions/mean_terminated_length": 439.2666931152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.007638855371624231,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0357,
"num_tokens": 35796598.0,
"reward": 1.9639207124710083,
"reward_std": 0.4240199625492096,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7832546830177307,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9318031072616577,
"step": 161
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.576569264069264,
"calib/avg_num_step_conf": 3.71484375,
"calib/ece": 0.17574218750000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.28125,
"calib/gap": 0.04588203463203455,
"calib/mean_conf": 0.7623828124999998,
"calib/mu_c": 0.7781547619047618,
"calib/mu_w": 0.7322727272727273,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14093750000000005,
"calib/std_conf": 0.20121939290135493,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1058.0,
"completions/max_terminated_length": 1058.0,
"completions/mean_length": 425.61328125,
"completions/mean_terminated_length": 427.2823791503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1728,
"grad_norm": 0.008791333064436913,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0266,
"num_tokens": 36009699.0,
"reward": 1.901847243309021,
"reward_std": 0.41961348056793213,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7394648790359497,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9382367134094238,
"step": 162
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6198596404826399,
"calib/avg_num_step_conf": 3.671875,
"calib/ece": 0.2371764705882352,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.23921568627450981,
"calib/gap": 0.08601822211278021,
"calib/mean_conf": 0.7487058823529411,
"calib/mu_c": 0.7905343511450382,
"calib/mu_w": 0.704516129032258,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23607843137254897,
"calib/std_conf": 0.20200827920498382,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1266.0,
"completions/max_terminated_length": 1266.0,
"completions/mean_length": 462.83984375,
"completions/mean_terminated_length": 462.83984375,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.0074338410049676895,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0123,
"num_tokens": 36233018.0,
"reward": 1.6685981750488281,
"reward_std": 0.4071301817893982,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6907016038894653,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9290037155151367,
"step": 163
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5939094189374144,
"calib/avg_num_step_conf": 3.66796875,
"calib/ece": 0.1917254901960785,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.23137254901960785,
"calib/gap": 0.07571855169839492,
"calib/mean_conf": 0.7327450980392157,
"calib/mu_c": 0.7665957446808511,
"calib/mu_w": 0.6908771929824562,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18576470588235305,
"calib/std_conf": 0.2040019072514894,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1710.0,
"completions/max_terminated_length": 1710.0,
"completions/mean_length": 476.5,
"completions/mean_terminated_length": 478.36865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.0079426234588027,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0127,
"num_tokens": 36461138.0,
"reward": 1.7319530248641968,
"reward_std": 0.4190503656864166,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7105636596679688,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9281857013702393,
"step": 164
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5508837209302326,
"calib/avg_num_step_conf": 3.8359375,
"calib/ece": 0.275748031496063,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2283464566929134,
"calib/gap": 0.0350337984496123,
"calib/mean_conf": 0.7620472440944882,
"calib/mu_c": 0.77984,
"calib/mu_w": 0.7448062015503877,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.27283464566929133,
"calib/std_conf": 0.18649201955726338,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2592.0,
"completions/max_terminated_length": 2592.0,
"completions/mean_length": 461.8046875,
"completions/mean_terminated_length": 461.8046875,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.176,
"grad_norm": 0.007690636441111565,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0221,
"num_tokens": 36684936.0,
"reward": 1.6147657632827759,
"reward_std": 0.4274684488773346,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6454948782920837,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9229432344436646,
"step": 165
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6298540249433107,
"calib/avg_num_step_conf": 3.71484375,
"calib/ece": 0.11531746031746046,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2777777777777778,
"calib/gap": 0.10089285714285712,
"calib/mean_conf": 0.7366666666666666,
"calib/mu_c": 0.7702976190476191,
"calib/mu_w": 0.669404761904762,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09265873015873027,
"calib/std_conf": 0.20488479147505745,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1129.0,
"completions/max_terminated_length": 1129.0,
"completions/mean_length": 473.66796875,
"completions/mean_terminated_length": 477.39764404296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.007974786683917046,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0102,
"num_tokens": 36912379.0,
"reward": 1.8942294120788574,
"reward_std": 0.4324895143508911,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7597531080245972,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9187272787094116,
"step": 166
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5789037645811241,
"calib/avg_num_step_conf": 3.7421875,
"calib/ece": 0.17070312499999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.30078125,
"calib/gap": 0.06559384941675517,
"calib/mean_conf": 0.742890625,
"calib/mu_c": 0.7664634146341465,
"calib/mu_w": 0.7008695652173913,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.136484375,
"calib/std_conf": 0.2149205970518167,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 993.0,
"completions/max_terminated_length": 993.0,
"completions/mean_length": 446.140625,
"completions/mean_terminated_length": 447.8902282714844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.008287301287055016,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0091,
"num_tokens": 37132199.0,
"reward": 1.8784657716751099,
"reward_std": 0.394712895154953,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7408288717269897,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9370966553688049,
"step": 167
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.599137104506232,
"calib/avg_num_step_conf": 3.49609375,
"calib/ece": 0.1996456692913386,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.32677165354330706,
"calib/gap": 0.06027165228507514,
"calib/mean_conf": 0.7601181102362204,
"calib/mu_c": 0.7850335570469799,
"calib/mu_w": 0.7247619047619047,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18657480314960628,
"calib/std_conf": 0.2056820462758194,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2449.0,
"completions/max_terminated_length": 2449.0,
"completions/mean_length": 473.3046875,
"completions/mean_terminated_length": 473.3046875,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.1792,
"grad_norm": 0.007830573245882988,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0542,
"num_tokens": 37358037.0,
"reward": 1.774470329284668,
"reward_std": 0.5710139274597168,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7054336071014404,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9315104484558105,
"step": 168
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5640266699900299,
"calib/avg_num_step_conf": 3.59765625,
"calib/ece": 0.2568503937007873,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.33858267716535434,
"calib/gap": 0.06924476570289129,
"calib/mean_conf": 0.7494488188976378,
"calib/mu_c": 0.7816176470588235,
"calib/mu_w": 0.7123728813559322,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23543307086614168,
"calib/std_conf": 0.23326274289524773,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1054.0,
"completions/max_terminated_length": 1054.0,
"completions/mean_length": 449.08984375,
"completions/mean_terminated_length": 452.6259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.008146989159286022,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0008,
"num_tokens": 37577188.0,
"reward": 1.6961153745651245,
"reward_std": 0.40185391902923584,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6801343560218811,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9324523210525513,
"step": 169
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7114549045424621,
"calib/avg_num_step_conf": 3.69921875,
"calib/ece": 0.17636363636363628,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.37549407114624506,
"calib/gap": 0.13707899934167211,
"calib/mean_conf": 0.7824505928853754,
"calib/mu_c": 0.8355483870967741,
"calib/mu_w": 0.698469387755102,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1730830039525691,
"calib/std_conf": 0.20751761325704726,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1740.0,
"completions/max_terminated_length": 1740.0,
"completions/mean_length": 450.67578125,
"completions/mean_terminated_length": 452.44317626953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.007818641141057014,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0039,
"num_tokens": 37796713.0,
"reward": 1.8228336572647095,
"reward_std": 0.3818396329879761,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7469961047172546,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9349637031555176,
"step": 170
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5743734335839599,
"calib/avg_num_step_conf": 3.66796875,
"calib/ece": 0.2789370078740158,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.1968503937007874,
"calib/gap": 0.055676691729323324,
"calib/mean_conf": 0.6964173228346456,
"calib/mu_c": 0.7271052631578947,
"calib/mu_w": 0.6714285714285714,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2632677165354331,
"calib/std_conf": 0.23475099694370016,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2675.0,
"completions/max_terminated_length": 2675.0,
"completions/mean_length": 430.09765625,
"completions/mean_terminated_length": 431.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.1824,
"grad_norm": 0.00880393385887146,
"learning_rate": 8.055555555555557e-07,
"loss": 0.0378,
"num_tokens": 38013714.0,
"reward": 1.5643079280853271,
"reward_std": 0.48412132263183594,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.6585675477981567,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9424140453338623,
"step": 171
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5357384594558007,
"calib/avg_num_step_conf": 3.58984375,
"calib/ece": 0.21105882352941183,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3411764705882353,
"calib/gap": 0.026694192500338554,
"calib/mean_conf": 0.7216470588235294,
"calib/mu_c": 0.7309638554216868,
"calib/mu_w": 0.7042696629213483,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14086274509803925,
"calib/std_conf": 0.2421286988464502,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2075.0,
"completions/max_terminated_length": 2075.0,
"completions/mean_length": 431.54296875,
"completions/mean_terminated_length": 431.54296875,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.008577347733080387,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0029,
"num_tokens": 38227541.0,
"reward": 1.8829433917999268,
"reward_std": 0.4061717987060547,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7155578136444092,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9412156343460083,
"step": 172
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5186795186795187,
"calib/avg_num_step_conf": 3.8984375,
"calib/ece": 0.28606299212598435,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": 0.017394317394317405,
"calib/mean_conf": 0.7839370078740158,
"calib/mu_c": 0.7915384615384616,
"calib/mu_w": 0.7741441441441442,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2535039370078741,
"calib/std_conf": 0.22368480875952962,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1097.0,
"completions/max_terminated_length": 1097.0,
"completions/mean_length": 447.54296875,
"completions/mean_terminated_length": 449.2980651855469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.00868956372141838,
"learning_rate": 7.5e-07,
"loss": 0.0118,
"num_tokens": 38445272.0,
"reward": 1.731661319732666,
"reward_std": 0.4513019323348999,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6584905982017517,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9322172403335571,
"step": 173
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5523114355231143,
"calib/avg_num_step_conf": 3.859375,
"calib/ece": 0.32207171314741045,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3426294820717131,
"calib/gap": 0.030355999487770458,
"calib/mean_conf": 0.7372908366533865,
"calib/mu_c": 0.753859649122807,
"calib/mu_w": 0.7235036496350365,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30258964143426303,
"calib/std_conf": 0.23218139090497097,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2521.0,
"completions/max_terminated_length": 2521.0,
"completions/mean_length": 497.0078125,
"completions/mean_terminated_length": 497.0078125,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.1856,
"grad_norm": 0.008361944928765297,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0872,
"num_tokens": 38676738.0,
"reward": 1.5412625074386597,
"reward_std": 0.43886956572532654,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.6203457117080688,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.9197043180465698,
"step": 174
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5983745983745984,
"calib/avg_num_step_conf": 3.56640625,
"calib/ece": 0.25137795275590563,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2559055118110236,
"calib/gap": 0.08235179235179191,
"calib/mean_conf": 0.6857086614173229,
"calib/mu_c": 0.7320720720720717,
"calib/mu_w": 0.6497202797202798,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2500393700787403,
"calib/std_conf": 0.24757582502655276,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1409.0,
"completions/max_terminated_length": 1409.0,
"completions/mean_length": 449.890625,
"completions/mean_terminated_length": 451.6549377441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.008198156021535397,
"learning_rate": 6.944444444444446e-07,
"loss": 0.0131,
"num_tokens": 38897734.0,
"reward": 1.543121337890625,
"reward_std": 0.4899318814277649,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6622323989868164,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.932127833366394,
"step": 175
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5620226656812022,
"calib/avg_num_step_conf": 3.72265625,
"calib/ece": 0.2756470588235293,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.35294117647058826,
"calib/gap": 0.05977087952697713,
"calib/mean_conf": 0.7579215686274509,
"calib/mu_c": 0.7888617886178861,
"calib/mu_w": 0.729090909090909,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2756078431372548,
"calib/std_conf": 0.22568239379297542,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1164.0,
"completions/max_terminated_length": 1164.0,
"completions/mean_length": 443.9765625,
"completions/mean_terminated_length": 445.7176818847656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.008800878189504147,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0109,
"num_tokens": 39115456.0,
"reward": 1.615879774093628,
"reward_std": 0.3879082202911377,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6507371068000793,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9377819299697876,
"step": 176
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6370708661417323,
"calib/avg_num_step_conf": 3.5546875,
"calib/ece": 0.21146825396825397,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.29365079365079366,
"calib/gap": 0.13484031496062976,
"calib/mean_conf": 0.6894047619047619,
"calib/mu_c": 0.7573599999999998,
"calib/mu_w": 0.62251968503937,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20242063492063492,
"calib/std_conf": 0.25798679302272365,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 453.82421875,
"completions/mean_terminated_length": 455.60394287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.1888,
"grad_norm": 0.00836542621254921,
"learning_rate": 6.388888888888889e-07,
"loss": 0.0377,
"num_tokens": 39335467.0,
"reward": 1.6355714797973633,
"reward_std": 0.3930942416191101,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6990835666656494,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9291399717330933,
"step": 177
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5905233380480905,
"calib/avg_num_step_conf": 3.5546875,
"calib/ece": 0.1696078431372549,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.3137254901960784,
"calib/gap": 0.0794419441944193,
"calib/mean_conf": 0.7003529411764705,
"calib/mu_c": 0.7318181818181817,
"calib/mu_w": 0.6523762376237624,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13301960784313727,
"calib/std_conf": 0.24759920361060206,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 914.0,
"completions/max_terminated_length": 914.0,
"completions/mean_length": 437.08984375,
"completions/mean_terminated_length": 438.803955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.008996784687042236,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0402,
"num_tokens": 39553434.0,
"reward": 1.8174397945404053,
"reward_std": 0.5554205179214478,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.725355863571167,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9428409337997437,
"step": 178
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5589751552795033,
"calib/avg_num_step_conf": 3.5078125,
"calib/ece": 0.20407843137254905,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.25098039215686274,
"calib/gap": 0.05414285714285749,
"calib/mean_conf": 0.6977254901960785,
"calib/mu_c": 0.7221428571428573,
"calib/mu_w": 0.6679999999999998,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17639215686274512,
"calib/std_conf": 0.2417929821417383,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2778.0,
"completions/max_terminated_length": 2778.0,
"completions/mean_length": 447.703125,
"completions/mean_terminated_length": 447.703125,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.008592971600592136,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0348,
"num_tokens": 39774310.0,
"reward": 1.7282838821411133,
"reward_std": 0.45130395889282227,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6959078311920166,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.943790078163147,
"step": 179
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5888141025641026,
"calib/avg_num_step_conf": 3.7265625,
"calib/ece": 0.23724409448818884,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.452755905511811,
"calib/gap": 0.06779230769230771,
"calib/mean_conf": 0.7698425196850395,
"calib/mu_c": 0.7976000000000001,
"calib/mu_w": 0.7298076923076924,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20826771653543297,
"calib/std_conf": 0.22764450892574783,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1691.0,
"completions/max_terminated_length": 1691.0,
"completions/mean_length": 479.984375,
"completions/mean_terminated_length": 483.7637634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.192,
"grad_norm": 0.007479749154299498,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0021,
"num_tokens": 40001042.0,
"reward": 1.7830191850662231,
"reward_std": 0.4244670569896698,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.701492965221405,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9305838346481323,
"step": 180
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5667370906605357,
"calib/avg_num_step_conf": 3.58203125,
"calib/ece": 0.32913385826771646,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": 0.06023488473249239,
"calib/mean_conf": 0.7860629921259844,
"calib/mu_c": 0.8176033057851239,
"calib/mu_w": 0.7573684210526315,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.31940944881889755,
"calib/std_conf": 0.22379214723901802,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2660.0,
"completions/max_terminated_length": 2660.0,
"completions/mean_length": 427.046875,
"completions/mean_terminated_length": 427.046875,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.00919902604073286,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0385,
"num_tokens": 40216630.0,
"reward": 1.590777039527893,
"reward_std": 0.5281400084495544,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6234515905380249,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9349694848060608,
"step": 181
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6672238652436671,
"calib/avg_num_step_conf": 3.5,
"calib/ece": 0.2038039215686274,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.43137254901960786,
"calib/gap": 0.1320027002700268,
"calib/mean_conf": 0.7763529411764705,
"calib/mu_c": 0.8286363636363636,
"calib/mu_w": 0.6966336633663368,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18811764705882347,
"calib/std_conf": 0.22953925065092615,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2196.0,
"completions/max_terminated_length": 2196.0,
"completions/mean_length": 438.4140625,
"completions/mean_terminated_length": 438.4140625,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.008267982862889767,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0387,
"num_tokens": 40435024.0,
"reward": 1.8188896179199219,
"reward_std": 0.39455264806747437,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7382515668869019,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.94355708360672,
"step": 182
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6263074984247007,
"calib/avg_num_step_conf": 3.4921875,
"calib/ece": 0.23162055335968376,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.37549407114624506,
"calib/gap": 0.11056521739130432,
"calib/mean_conf": 0.7377865612648221,
"calib/mu_c": 0.7880434782608695,
"calib/mu_w": 0.6774782608695652,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21197628458498022,
"calib/std_conf": 0.2486545651779337,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1546.0,
"completions/max_terminated_length": 1546.0,
"completions/mean_length": 482.29296875,
"completions/mean_terminated_length": 486.0905456542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.1952,
"grad_norm": 0.007671494036912918,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0029,
"num_tokens": 40665171.0,
"reward": 1.7111189365386963,
"reward_std": 0.5463021397590637,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6997734308242798,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.933765172958374,
"step": 183
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5814993564993564,
"calib/avg_num_step_conf": 3.4453125,
"calib/ece": 0.24713147410358563,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.41434262948207173,
"calib/gap": 0.08322200772200794,
"calib/mean_conf": 0.7615537848605577,
"calib/mu_c": 0.7983571428571429,
"calib/mu_w": 0.7151351351351349,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22545816733067728,
"calib/std_conf": 0.24086409869355724,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 471.40625,
"completions/mean_terminated_length": 476.9960632324219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.007687829434871674,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0316,
"num_tokens": 40891131.0,
"reward": 1.7092678546905518,
"reward_std": 0.580082893371582,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6801937818527222,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.9225028157234192,
"step": 184
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5625609994144056,
"calib/avg_num_step_conf": 3.38671875,
"calib/ece": 0.24247999999999992,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.364,
"calib/gap": 0.05972151734010023,
"calib/mean_conf": 0.75616,
"calib/mu_c": 0.7821985815602838,
"calib/mu_w": 0.7224770642201835,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.21731999999999993,
"calib/std_conf": 0.23194924962154972,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2094.0,
"completions/max_terminated_length": 2094.0,
"completions/mean_length": 447.8515625,
"completions/mean_terminated_length": 456.7729187011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.008878587745130062,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0212,
"num_tokens": 41112701.0,
"reward": 1.704228162765503,
"reward_std": 0.4695289134979248,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6678878664970398,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.9146498441696167,
"step": 185
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6606127996046454,
"calib/avg_num_step_conf": 3.546875,
"calib/ece": 0.2214453125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.33984375,
"calib/gap": 0.12739436619718314,
"calib/mean_conf": 0.7456640625,
"calib/mu_c": 0.8023943661971832,
"calib/mu_w": 0.675,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2062109375,
"calib/std_conf": 0.2433866451163582,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 875.0,
"completions/max_terminated_length": 875.0,
"completions/mean_length": 447.9453125,
"completions/mean_terminated_length": 451.4724426269531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.1984,
"grad_norm": 0.008053626865148544,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0144,
"num_tokens": 41332415.0,
"reward": 1.747734785079956,
"reward_std": 0.39273902773857117,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7202167510986328,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.9425970315933228,
"step": 186
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4550224887556222,
"calib/avg_num_step_conf": 4.1953125,
"calib/ece": 0.31059055118110224,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4251968503937008,
"calib/gap": -0.0163343328335831,
"calib/mean_conf": 0.7876771653543306,
"calib/mu_c": 0.7802173913043479,
"calib/mu_w": 0.796551724137931,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2774803149606298,
"calib/std_conf": 0.21472862866131273,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2410.0,
"completions/max_terminated_length": 2410.0,
"completions/mean_length": 499.9609375,
"completions/mean_terminated_length": 499.9609375,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.00757110770791769,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0674,
"num_tokens": 41561949.0,
"reward": 1.6947870254516602,
"reward_std": 0.4970128536224365,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6329605579376221,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9274370670318604,
"step": 187
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.48736961166268683,
"calib/avg_num_step_conf": 3.578125,
"calib/ece": 0.31290196078431354,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.45098039215686275,
"calib/gap": -0.00617255246952364,
"calib/mean_conf": 0.8032549019607843,
"calib/mu_c": 0.8006164383561644,
"calib/mu_w": 0.806788990825688,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2718039215686272,
"calib/std_conf": 0.21318866201847056,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1146.0,
"completions/max_terminated_length": 1146.0,
"completions/mean_length": 461.875,
"completions/mean_terminated_length": 463.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.007569571956992149,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0095,
"num_tokens": 41784261.0,
"reward": 1.7472176551818848,
"reward_std": 0.42943885922431946,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6471472978591919,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9354736804962158,
"step": 188
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5716177861873227,
"calib/avg_num_step_conf": 3.4140625,
"calib/ece": 0.25820312499999987,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.43359375,
"calib/gap": 0.0431081677704197,
"calib/mean_conf": 0.7660937499999999,
"calib/mu_c": 0.7837748344370863,
"calib/mu_w": 0.7406666666666666,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21722656249999991,
"calib/std_conf": 0.24760803947153553,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1027.0,
"completions/max_terminated_length": 1027.0,
"completions/mean_length": 428.27734375,
"completions/mean_terminated_length": 429.9568786621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.2016,
"grad_norm": 0.0090395612642169,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0144,
"num_tokens": 42001668.0,
"reward": 1.7883484363555908,
"reward_std": 0.40974628925323486,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6837632656097412,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9461929798126221,
"step": 189
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6864989989989991,
"calib/avg_num_step_conf": 3.265625,
"calib/ece": 0.2174117647058823,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.40784313725490196,
"calib/gap": 0.1463269519519521,
"calib/mean_conf": 0.776235294117647,
"calib/mu_c": 0.8399305555555555,
"calib/mu_w": 0.6936036036036034,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.21447058823529408,
"calib/std_conf": 0.21931610173403926,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 935.0,
"completions/max_terminated_length": 935.0,
"completions/mean_length": 469.2265625,
"completions/mean_terminated_length": 471.0666809082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.007644584868103266,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.003,
"num_tokens": 42227398.0,
"reward": 1.7453137636184692,
"reward_std": 0.4353315234184265,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7184046506881714,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.9269128441810608,
"step": 190
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5794139374538291,
"calib/avg_num_step_conf": 3.53515625,
"calib/ece": 0.3589411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.48627450980392156,
"calib/gap": 0.03948227037675456,
"calib/mean_conf": 0.8143137254901961,
"calib/mu_c": 0.8345967741935484,
"calib/mu_w": 0.7951145038167938,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34349019607843145,
"calib/std_conf": 0.2008282350858299,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2628.0,
"completions/max_terminated_length": 2628.0,
"completions/mean_length": 443.63671875,
"completions/mean_terminated_length": 443.63671875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.008346221409738064,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0273,
"num_tokens": 42445137.0,
"reward": 1.611642837524414,
"reward_std": 0.420346736907959,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6173906326293945,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9385555982589722,
"step": 191
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6621045477014336,
"calib/avg_num_step_conf": 3.42578125,
"calib/ece": 0.23835294117647055,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.42745098039215684,
"calib/gap": 0.1552205882352944,
"calib/mean_conf": 0.7657254901960784,
"calib/mu_c": 0.8381617647058824,
"calib/mu_w": 0.682941176470588,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2353725490196078,
"calib/std_conf": 0.24121458012312477,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 482.12109375,
"completions/mean_terminated_length": 482.12109375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.2048,
"grad_norm": 0.00783407874405384,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.026,
"num_tokens": 42673536.0,
"reward": 1.7091728448867798,
"reward_std": 0.4516070485115051,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7133882641792297,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.9436155557632446,
"step": 192
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5753201181974883,
"calib/avg_num_step_conf": 3.421875,
"calib/ece": 0.28921568627450966,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4196078431372549,
"calib/gap": 0.052146638758926445,
"calib/mean_conf": 0.7609019607843136,
"calib/mu_c": 0.7862595419847328,
"calib/mu_w": 0.7341129032258064,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.26819607843137244,
"calib/std_conf": 0.24440959312564142,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1641.0,
"completions/max_terminated_length": 1641.0,
"completions/mean_length": 477.22265625,
"completions/mean_terminated_length": 479.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.008240696042776108,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.039,
"num_tokens": 42901417.0,
"reward": 1.6538842916488647,
"reward_std": 0.5406385660171509,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6447539329528809,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.931720495223999,
"step": 193
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.64476976976977,
"calib/avg_num_step_conf": 3.3515625,
"calib/ece": 0.22789062499999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.49609375,
"calib/gap": 0.14399399399399393,
"calib/mean_conf": 0.7854687499999998,
"calib/mu_c": 0.8462162162162162,
"calib/mu_w": 0.7022222222222223,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21761718749999995,
"calib/std_conf": 0.24354593821995366,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 923.0,
"completions/max_terminated_length": 923.0,
"completions/mean_length": 417.890625,
"completions/mean_terminated_length": 419.5294494628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.008800129406154156,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0017,
"num_tokens": 43114341.0,
"reward": 1.7753586769104004,
"reward_std": 0.5017086863517761,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7127922177314758,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9433302283287048,
"step": 194
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6631269618646898,
"calib/avg_num_step_conf": 3.359375,
"calib/ece": 0.18673228346456677,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.452755905511811,
"calib/gap": 0.13623388766446276,
"calib/mean_conf": 0.7712992125984252,
"calib/mu_c": 0.8211801242236025,
"calib/mu_w": 0.6849462365591398,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16208661417322823,
"calib/std_conf": 0.2503483499018769,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2161.0,
"completions/max_terminated_length": 2161.0,
"completions/mean_length": 442.984375,
"completions/mean_terminated_length": 442.984375,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.208,
"grad_norm": 0.00870486069470644,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0373,
"num_tokens": 43333729.0,
"reward": 1.8612749576568604,
"reward_std": 0.4175664186477661,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7437324523925781,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9435549378395081,
"step": 195
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.557908496732026,
"calib/avg_num_step_conf": 3.2265625,
"calib/ece": 0.21549407114624491,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.383399209486166,
"calib/gap": 0.06189019607843138,
"calib/mean_conf": 0.8010276679841897,
"calib/mu_c": 0.8254901960784313,
"calib/mu_w": 0.7636,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20588932806324095,
"calib/std_conf": 0.19607948705152148,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1168.0,
"completions/max_terminated_length": 1168.0,
"completions/mean_length": 402.36328125,
"completions/mean_terminated_length": 405.531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.008162040263414383,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0011,
"num_tokens": 43539278.0,
"reward": 1.802194595336914,
"reward_std": 0.2759098410606384,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7052210569381714,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.941057026386261,
"step": 196
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6223738495715645,
"calib/avg_num_step_conf": 3.4453125,
"calib/ece": 0.33499999999999996,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.43253968253968256,
"calib/gap": 0.1003122818152965,
"calib/mean_conf": 0.7846825396825396,
"calib/mu_c": 0.8392173913043476,
"calib/mu_w": 0.7389051094890511,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.33166666666666667,
"calib/std_conf": 0.21787460536600237,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2516.0,
"completions/max_terminated_length": 2516.0,
"completions/mean_length": 431.19140625,
"completions/mean_terminated_length": 436.3043518066406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.008852004073560238,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0215,
"num_tokens": 43754719.0,
"reward": 1.5589957237243652,
"reward_std": 0.557856559753418,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6363070011138916,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.9356131553649902,
"step": 197
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5163560666137985,
"calib/avg_num_step_conf": 3.64453125,
"calib/ece": 0.22762845849802363,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4031620553359684,
"calib/gap": 0.04762688342585242,
"calib/mean_conf": 0.7846245059288537,
"calib/mu_c": 0.8028846153846153,
"calib/mu_w": 0.7552577319587629,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19782608695652165,
"calib/std_conf": 0.2153707101689293,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2137.0,
"completions/max_terminated_length": 2137.0,
"completions/mean_length": 427.171875,
"completions/mean_terminated_length": 430.5354309082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.2112,
"grad_norm": 0.008308351039886475,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0106,
"num_tokens": 43969459.0,
"reward": 1.8177075386047363,
"reward_std": 0.4939545691013336,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7031598091125488,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.9348583221435547,
"step": 198
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6144176180672531,
"calib/avg_num_step_conf": 3.52734375,
"calib/ece": 0.2922440944881891,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.484251968503937,
"calib/gap": 0.08612826751512892,
"calib/mean_conf": 0.8039763779527558,
"calib/mu_c": 0.8436496350364964,
"calib/mu_w": 0.7575213675213675,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2784251968503938,
"calib/std_conf": 0.21471170799990183,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2095.0,
"completions/max_terminated_length": 2095.0,
"completions/mean_length": 482.859375,
"completions/mean_terminated_length": 484.7529602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.008521615527570248,
"learning_rate": 2.777777777777778e-08,
"loss": 0.0147,
"num_tokens": 44197271.0,
"reward": 1.7006497383117676,
"reward_std": 0.5969787836074829,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6729308366775513,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9343558549880981,
"step": 199
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6254920634920635,
"calib/avg_num_step_conf": 3.41015625,
"calib/ece": 0.2550980392156863,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4823529411764706,
"calib/gap": 0.07900952380952386,
"calib/mean_conf": 0.7966666666666665,
"calib/mu_c": 0.8292,
"calib/mu_w": 0.7501904761904762,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23176470588235298,
"calib/std_conf": 0.22270763976670738,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1165.0,
"completions/max_terminated_length": 1165.0,
"completions/mean_length": 448.09765625,
"completions/mean_terminated_length": 449.85491943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.008153123781085014,
"learning_rate": 0.0,
"loss": 0.0175,
"num_tokens": 44420032.0,
"reward": 1.78486168384552,
"reward_std": 0.38065484166145325,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6998906135559082,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.9395561218261719,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.03268525441642851,
"train_runtime": 11125.5146,
"train_samples_per_second": 4.602,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 44420032,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}