Files
PureRL-1.5B-v7-s2-l1-maskoff/trainer_state.json
ModelHub XC 3bbb410953 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l1-maskoff
Source: Original Platform
2026-06-04 16:46:27 +08:00

9843 lines
385 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.0069753071293234825,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0124,
"num_tokens": 229171.0,
"reward": 0.326894611120224,
"reward_std": 0.18101200461387634,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.39095962047576904,
"step": 1
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.007095666602253914,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0052,
"num_tokens": 458661.0,
"reward": 0.256092369556427,
"reward_std": 0.19392633438110352,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.4261414408683777,
"step": 2
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5330286663896967,
"calib/avg_num_step_conf": 5.046875,
"calib/ece": 0.22395256916996056,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2648221343873518,
"calib/gap": 0.004881595346905021,
"calib/mean_conf": 0.87699604743083,
"calib/mu_c": 0.8786746987951808,
"calib/mu_w": 0.8737931034482758,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22241106719367598,
"calib/std_conf": 0.04800020569979932,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.776433203631647,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.0018650654358696173,
"calib/step_q_w": 0.7745681381957774,
"calib/step_q_w_n": 521.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1968.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 483.37109375,
"completions/mean_terminated_length": 487.1771545410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.00795243214815855,
"kl": 0.00039833784103393555,
"learning_rate": 7.5e-07,
"loss": 0.0129,
"num_tokens": 687660.0,
"reward": 0.32164907455444336,
"reward_std": 0.16832180321216583,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.715328574180603,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.39859285950660706,
"step": 3
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.479371387283237,
"calib/avg_num_step_conf": 5.125,
"calib/ece": 0.18932806324110682,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.22529644268774704,
"calib/gap": -0.0005520231213871352,
"calib/mean_conf": 0.8731225296442688,
"calib/mu_c": 0.8729479768786128,
"calib/mu_w": 0.8734999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18932806324110682,
"calib/std_conf": 0.047541668911902576,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7937211981566821,
"calib/step_q_c_n": 868.0,
"calib/step_q_gap": 0.010252729688213558,
"calib/step_q_w": 0.7834684684684685,
"calib/step_q_w_n": 444.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2575.0,
"completions/max_terminated_length": 2575.0,
"completions/mean_length": 523.90625,
"completions/mean_terminated_length": 523.90625,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.007437299005687237,
"kl": 0.00030091404914855957,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0375,
"num_tokens": 927948.0,
"reward": 0.32690131664276123,
"reward_std": 0.157812237739563,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7328823804855347,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.411111056804657,
"step": 4
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.43493761140819964,
"calib/avg_num_step_conf": 4.80859375,
"calib/ece": 0.35247011952191243,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.2908366533864542,
"calib/gap": -0.007580213903743083,
"calib/mean_conf": 0.8783665338645419,
"calib/mu_c": 0.8747727272727274,
"calib/mu_w": 0.8823529411764705,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.35247011952191243,
"calib/std_conf": 0.0467523047937078,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7908806818181818,
"calib/step_q_c_n": 704.0,
"calib/step_q_gap": 0.0046757482318440236,
"calib/step_q_w": 0.7862049335863378,
"calib/step_q_w_n": 527.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2852.0,
"completions/max_terminated_length": 2852.0,
"completions/mean_length": 521.390625,
"completions/mean_terminated_length": 521.390625,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.007108544930815697,
"kl": 0.00028464198112487793,
"learning_rate": 1.25e-06,
"loss": -0.0318,
"num_tokens": 1168112.0,
"reward": 0.2227281630039215,
"reward_std": 0.178195059299469,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6066081523895264,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.45880815386772156,
"step": 5
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.47032474804031354,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.3324705882352943,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3254901960784314,
"calib/gap": -0.0035386338185892097,
"calib/mean_conf": 0.8854117647058822,
"calib/mu_c": 0.8838297872340424,
"calib/mu_w": 0.8873684210526316,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3324705882352943,
"calib/std_conf": 0.03807792653287041,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7958333333333333,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.001792771710868335,
"calib/step_q_w": 0.794040561622465,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1633.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 438.1015625,
"completions/mean_terminated_length": 438.1015625,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.0064,
"grad_norm": 0.0097042853012681,
"kl": 0.0008447170257568359,
"learning_rate": 1.5e-06,
"loss": -0.0065,
"num_tokens": 1386218.0,
"reward": 0.24446246027946472,
"reward_std": 0.16572800278663635,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6327519416809082,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.45242077112197876,
"step": 6
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.48979800040807997,
"calib/avg_num_step_conf": 5.30078125,
"calib/ece": 0.21792968750000008,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.30859375,
"calib/gap": -0.0032122696048426658,
"calib/mean_conf": 0.8773046875,
"calib/mu_c": 0.8762130177514792,
"calib/mu_w": 0.8794252873563219,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2175390625000001,
"calib/std_conf": 0.05623056033445998,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7863606911447084,
"calib/step_q_c_n": 926.0,
"calib/step_q_gap": -0.01681796314763484,
"calib/step_q_w": 0.8031786542923433,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1580.0,
"completions/max_terminated_length": 1580.0,
"completions/mean_length": 522.12890625,
"completions/mean_terminated_length": 524.176513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.006987900473177433,
"kl": 0.00025841593742370605,
"learning_rate": 1.75e-06,
"loss": 0.0076,
"num_tokens": 1627307.0,
"reward": 0.3210902214050293,
"reward_std": 0.1542452871799469,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7219562530517578,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.4110258221626282,
"step": 7
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.44363992172211353,
"calib/avg_num_step_conf": 4.79296875,
"calib/ece": 0.2905179282868527,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.23904382470119523,
"calib/gap": 0.00294846705805607,
"calib/mean_conf": 0.872191235059761,
"calib/mu_c": 0.8734246575342466,
"calib/mu_w": 0.8704761904761905,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2905179282868527,
"calib/std_conf": 0.07471845750082982,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.796625386996904,
"calib/step_q_c_n": 646.0,
"calib/step_q_gap": 0.03562710816729986,
"calib/step_q_w": 0.7609982788296041,
"calib/step_q_w_n": 581.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2913.0,
"completions/max_terminated_length": 2913.0,
"completions/mean_length": 529.171875,
"completions/mean_terminated_length": 529.171875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.007049232255667448,
"kl": 0.00041344761848449707,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0138,
"num_tokens": 1869287.0,
"reward": 0.28460854291915894,
"reward_std": 0.1505751758813858,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6487230062484741,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3873184621334076,
"step": 8
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5248116663211003,
"calib/avg_num_step_conf": 4.74609375,
"calib/ece": 0.24740000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.236,
"calib/gap": 0.0012861980786510463,
"calib/mean_conf": 0.8788400000000001,
"calib/mu_c": 0.879308176100629,
"calib/mu_w": 0.8780219780219779,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24512000000000003,
"calib/std_conf": 0.0420173107183218,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.778821243523316,
"calib/step_q_c_n": 772.0,
"calib/step_q_gap": 0.010085351875460669,
"calib/step_q_w": 0.7687358916478554,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2856.0,
"completions/max_terminated_length": 2856.0,
"completions/mean_length": 494.9921875,
"completions/mean_terminated_length": 498.8897705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.0096,
"grad_norm": 0.0068976497277617455,
"kl": 0.0003219097852706909,
"learning_rate": 2.25e-06,
"loss": 0.022,
"num_tokens": 2103541.0,
"reward": 0.2907140254974365,
"reward_std": 0.18676680326461792,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.689523458480835,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.4260641038417816,
"step": 9
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4660277636271014,
"calib/avg_num_step_conf": 5.0078125,
"calib/ece": 0.2923921568627451,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.34901960784313724,
"calib/gap": -0.005308201732042828,
"calib/mean_conf": 0.8845490196078432,
"calib/mu_c": 0.882384105960265,
"calib/mu_w": 0.8876923076923078,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2923921568627451,
"calib/std_conf": 0.04479684588226738,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7939179632248938,
"calib/step_q_c_n": 707.0,
"calib/step_q_gap": 0.013187528442285212,
"calib/step_q_w": 0.7807304347826086,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 492.23828125,
"completions/mean_terminated_length": 492.23828125,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.006544630043208599,
"kl": 0.00033104419708251953,
"learning_rate": 2.5e-06,
"loss": 0.0179,
"num_tokens": 2336354.0,
"reward": 0.2788810431957245,
"reward_std": 0.18121352791786194,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6639195084571838,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.4217824339866638,
"step": 10
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4625872318428535,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.31301587301587314,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.23412698412698413,
"calib/gap": -0.022662186611527524,
"calib/mean_conf": 0.8665873015873017,
"calib/mu_c": 0.857054794520548,
"calib/mu_w": 0.8797169811320755,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30011904761904773,
"calib/std_conf": 0.0959800136140514,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7820560747663551,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.030459983561143433,
"calib/step_q_w": 0.7515960912052116,
"calib/step_q_w_n": 614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 514.359375,
"completions/mean_terminated_length": 516.3765258789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.006941006984561682,
"kl": 0.0005399882793426514,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0012,
"num_tokens": 2572510.0,
"reward": 0.2535046935081482,
"reward_std": 0.17274489998817444,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.643332839012146,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.4472609758377075,
"step": 11
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4752100840336134,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.2063385826771654,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2755905511811024,
"calib/gap": -0.003327731092436781,
"calib/mean_conf": 0.8756299212598426,
"calib/mu_c": 0.874529411764706,
"calib/mu_w": 0.8778571428571428,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2063385826771654,
"calib/std_conf": 0.05151443542808282,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7933858267716535,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.01854121982466983,
"calib/step_q_w": 0.7748446069469836,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2058.0,
"completions/max_terminated_length": 2058.0,
"completions/mean_length": 478.34765625,
"completions/mean_terminated_length": 478.34765625,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0128,
"grad_norm": 0.007310581859201193,
"kl": 0.003771156072616577,
"learning_rate": 3e-06,
"loss": 0.0223,
"num_tokens": 2799143.0,
"reward": 0.33672308921813965,
"reward_std": 0.16366788744926453,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7147250175476074,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3701850175857544,
"step": 12
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5062836021505376,
"calib/avg_num_step_conf": 4.53125,
"calib/ece": 0.23185770750988133,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.18972332015810275,
"calib/gap": 0.0052197580645162445,
"calib/mean_conf": 0.8642687747035573,
"calib/mu_c": 0.8661875,
"calib/mu_w": 0.8609677419354838,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23185770750988133,
"calib/std_conf": 0.05012197467318067,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7799859747545582,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": -0.00462252636401006,
"calib/step_q_w": 0.7846085011185683,
"calib/step_q_w_n": 447.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2917.0,
"completions/max_terminated_length": 2917.0,
"completions/mean_length": 476.8046875,
"completions/mean_terminated_length": 476.8046875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.007094182539731264,
"kl": 0.0008336901664733887,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0411,
"num_tokens": 3025797.0,
"reward": 0.3195507228374481,
"reward_std": 0.16193151473999023,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7047258019447327,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3874993324279785,
"step": 13
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.44118594908153397,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.3032539682539682,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.26587301587301587,
"calib/gap": -0.012412504028359472,
"calib/mean_conf": 0.877063492063492,
"calib/mu_c": 0.8717931034482759,
"calib/mu_w": 0.8842056074766353,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.30246031746031743,
"calib/std_conf": 0.046531563964134603,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7948476052249636,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.03369640247582262,
"calib/step_q_w": 0.7611512027491409,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 527.546875,
"completions/mean_terminated_length": 531.7008056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.008941327221691608,
"kl": 0.01397097110748291,
"learning_rate": 3.5e-06,
"loss": 0.0107,
"num_tokens": 3266249.0,
"reward": 0.2711637616157532,
"reward_std": 0.18728607892990112,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6459636688232422,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.4130111634731293,
"step": 14
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5223948220064725,
"calib/avg_num_step_conf": 4.3515625,
"calib/ece": 0.2808300395256915,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.25691699604743085,
"calib/gap": 0.0034103559870551914,
"calib/mean_conf": 0.8734782608695653,
"calib/mu_c": 0.8748666666666668,
"calib/mu_w": 0.8714563106796116,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.28071146245059275,
"calib/std_conf": 0.05053622073781924,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7761708860759493,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": -0.018082225957245734,
"calib/step_q_w": 0.794253112033195,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 457.26171875,
"completions/mean_terminated_length": 457.26171875,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.016,
"grad_norm": 0.007930455729365349,
"kl": 0.0013884305953979492,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.004,
"num_tokens": 3491188.0,
"reward": 0.2793067693710327,
"reward_std": 0.16331849992275238,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6710308790206909,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.427261084318161,
"step": 15
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.47225090345895715,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.27375494071146245,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2490118577075099,
"calib/gap": 0.0075025813113061,
"calib/mean_conf": 0.862687747035573,
"calib/mu_c": 0.8657718120805369,
"calib/mu_w": 0.8582692307692308,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.27375494071146245,
"calib/std_conf": 0.08306795951084686,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7630601792573624,
"calib/step_q_c_n": 781.0,
"calib/step_q_gap": 0.005200330772513917,
"calib/step_q_w": 0.7578598484848484,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2105.0,
"completions/max_terminated_length": 2105.0,
"completions/mean_length": 607.16015625,
"completions/mean_terminated_length": 609.5411987304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.006173198111355305,
"kl": 0.0014122724533081055,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0132,
"num_tokens": 3755469.0,
"reward": 0.28060784935951233,
"reward_std": 0.15960822999477386,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.66645348072052,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.41773784160614014,
"step": 16
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5471626733921816,
"calib/avg_num_step_conf": 4.421875,
"calib/ece": 0.15024193548387096,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.1774193548387097,
"calib/gap": -0.0007297183690626596,
"calib/mean_conf": 0.855,
"calib/mu_c": 0.8548087431693989,
"calib/mu_w": 0.8555384615384616,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.13366935483870968,
"calib/std_conf": 0.06292596776653371,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.7674694376528117,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.030176444022238402,
"calib/step_q_w": 0.7372929936305733,
"calib/step_q_w_n": 314.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3049.0,
"completions/max_terminated_length": 3049.0,
"completions/mean_length": 511.23046875,
"completions/mean_terminated_length": 517.29248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.007242945488542318,
"kl": 0.0038924217224121094,
"learning_rate": 4.25e-06,
"loss": 0.0083,
"num_tokens": 3989872.0,
"reward": 0.3711223602294922,
"reward_std": 0.17457322776317596,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.74609375,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.3358803391456604,
"step": 17
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4884920634920635,
"calib/avg_num_step_conf": 3.5625,
"calib/ece": 0.31554655870445353,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.16194331983805668,
"calib/gap": 0.00042129629629628074,
"calib/mean_conf": 0.8621052631578948,
"calib/mu_c": 0.8622962962962963,
"calib/mu_w": 0.8618750000000001,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.31554655870445353,
"calib/std_conf": 0.051389804403117426,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.7643207126948777,
"calib/step_q_c_n": 449.0,
"calib/step_q_gap": 0.0965669329972535,
"calib/step_q_w": 0.6677537796976242,
"calib/step_q_w_n": 463.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2532.0,
"completions/max_terminated_length": 2532.0,
"completions/mean_length": 487.09375,
"completions/mean_terminated_length": 492.8695983886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0192,
"grad_norm": 0.007715197745710611,
"kl": 0.0037174224853515625,
"learning_rate": 4.5e-06,
"loss": 0.0471,
"num_tokens": 4225288.0,
"reward": 0.24096588790416718,
"reward_std": 0.15334269404411316,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6134636998176575,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.4260631203651428,
"step": 18
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4771100164203613,
"calib/avg_num_step_conf": 3.140625,
"calib/ece": 0.28051999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.18,
"calib/gap": 0.011359605911330006,
"calib/mean_conf": 0.8551599999999999,
"calib/mu_c": 0.8599310344827586,
"calib/mu_w": 0.8485714285714286,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.2778399999999999,
"calib/std_conf": 0.09371112207203582,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7566736842105263,
"calib/step_q_c_n": 475.0,
"calib/step_q_gap": 0.01700803071508561,
"calib/step_q_w": 0.7396656534954407,
"calib/step_q_w_n": 329.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 458.39453125,
"completions/mean_terminated_length": 460.1921691894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.007385259959846735,
"kl": 0.0075931549072265625,
"learning_rate": 4.75e-06,
"loss": 0.0268,
"num_tokens": 4447397.0,
"reward": 0.279776006937027,
"reward_std": 0.19024288654327393,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6482691764831543,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.39340460300445557,
"step": 19
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.43082368082368083,
"calib/avg_num_step_conf": 3.58984375,
"calib/ece": 0.26715415019762856,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.16600790513833993,
"calib/gap": 0.0013693693693692666,
"calib/mean_conf": 0.8521343873517787,
"calib/mu_c": 0.8527027027027028,
"calib/mu_w": 0.8513333333333335,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.26715415019762856,
"calib/std_conf": 0.07074553382822461,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7535021097046413,
"calib/step_q_c_n": 474.0,
"calib/step_q_gap": 0.04516503105295577,
"calib/step_q_w": 0.7083370786516855,
"calib/step_q_w_n": 445.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3032.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 434.2578125,
"completions/mean_terminated_length": 434.2578125,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.007737329229712486,
"kl": 0.00714874267578125,
"learning_rate": 5e-06,
"loss": -0.006,
"num_tokens": 4663439.0,
"reward": 0.29337871074676514,
"reward_std": 0.14856009185314178,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6688362956047058,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3937976360321045,
"step": 20
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.506896551724138,
"calib/avg_num_step_conf": 3.28125,
"calib/ece": 0.19932539682539668,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.1626984126984127,
"calib/gap": 0.0032706374085688106,
"calib/mean_conf": 0.8509920634920635,
"calib/mu_c": 0.8521212121212123,
"calib/mu_w": 0.8488505747126435,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.19777777777777764,
"calib/std_conf": 0.05771619068485525,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7341996233521657,
"calib/step_q_c_n": 531.0,
"calib/step_q_gap": 0.001448814290677003,
"calib/step_q_w": 0.7327508090614887,
"calib/step_q_w_n": 309.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2639.0,
"completions/max_terminated_length": 2639.0,
"completions/mean_length": 456.33203125,
"completions/mean_terminated_length": 456.33203125,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0224,
"grad_norm": 0.0093642957508564,
"kl": 0.015064239501953125,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0232,
"num_tokens": 4883220.0,
"reward": 0.3449612259864807,
"reward_std": 0.19086401164531708,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7126258015632629,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3453596234321594,
"step": 21
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5431784943241172,
"calib/avg_num_step_conf": 3.0234375,
"calib/ece": 0.17089843749999994,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.11328125,
"calib/gap": 0.012915244794205805,
"calib/mean_conf": 0.8466796875,
"calib/mu_c": 0.8508670520231213,
"calib/mu_w": 0.8379518072289155,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17089843749999994,
"calib/std_conf": 0.0530285055409102,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7707037037037037,
"calib/step_q_c_n": 540.0,
"calib/step_q_gap": 0.04271225071225071,
"calib/step_q_w": 0.727991452991453,
"calib/step_q_w_n": 234.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 926.0,
"completions/max_terminated_length": 926.0,
"completions/mean_length": 413.66796875,
"completions/mean_terminated_length": 415.29022216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.008529752492904663,
"kl": 0.0120086669921875,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0041,
"num_tokens": 5090935.0,
"reward": 0.3591231405735016,
"reward_std": 0.14010533690452576,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.754540205001831,
"rewards/format_reward_step": 1.0,
"rewards/step_l1_reward": -0.37145018577575684,
"step": 22
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5214104108122575,
"calib/avg_num_step_conf": 2.8984375,
"calib/ece": 0.2740485829959515,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.10121457489878542,
"calib/gap": 0.0029365716579685675,
"calib/mean_conf": 0.8438461538461539,
"calib/mu_c": 0.8451063829787234,
"calib/mu_w": 0.8421698113207549,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.27352226720647776,
"calib/std_conf": 0.056311961962853735,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.7343103448275862,
"calib/step_q_c_n": 406.0,
"calib/step_q_gap": 0.006215106732348152,
"calib/step_q_w": 0.728095238095238,
"calib/step_q_w_n": 336.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 472.8125,
"completions/mean_terminated_length": 476.5354309082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.0072917933575809,
"kl": 0.01273345947265625,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0068,
"num_tokens": 5315911.0,
"reward": 0.26862412691116333,
"reward_std": 0.19663289189338684,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.631864070892334,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.3930532932281494,
"step": 23
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5548489131166296,
"calib/avg_num_step_conf": 2.79296875,
"calib/ece": 0.31856557377049183,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.12704918032786885,
"calib/gap": 0.01096641765933104,
"calib/mean_conf": 0.8314344262295084,
"calib/mu_c": 0.8366929133858269,
"calib/mu_w": 0.8257264957264958,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.31475409836065577,
"calib/std_conf": 0.07833298906265776,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.7263687150837987,
"calib/step_q_c_n": 358.0,
"calib/step_q_gap": 0.0031194153639106936,
"calib/step_q_w": 0.723249299719888,
"calib/step_q_w_n": 357.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 507.5859375,
"completions/mean_terminated_length": 519.7680053710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.0256,
"grad_norm": 0.0077531770803034306,
"kl": 0.011515617370605469,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0301,
"num_tokens": 5550365.0,
"reward": 0.25186437368392944,
"reward_std": 0.2193448543548584,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6128253936767578,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": -0.395815372467041,
"step": 24
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.47339075854700857,
"calib/avg_num_step_conf": 3.046875,
"calib/ece": 0.23178571428571426,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": -0.0008333333333332416,
"calib/mean_conf": 0.8478174603174603,
"calib/mu_c": 0.8475,
"calib/mu_w": 0.8483333333333333,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23027777777777775,
"calib/std_conf": 0.06529237063693799,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7341647058823529,
"calib/step_q_c_n": 425.0,
"calib/step_q_gap": 0.009347804473902266,
"calib/step_q_w": 0.7248169014084507,
"calib/step_q_w_n": 355.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2761.0,
"completions/max_terminated_length": 2761.0,
"completions/mean_length": 452.5703125,
"completions/mean_terminated_length": 452.5703125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.008476924151182175,
"kl": 0.014321327209472656,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0502,
"num_tokens": 5769447.0,
"reward": 0.3191983699798584,
"reward_std": 0.16112613677978516,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6910125017166138,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.36902207136154175,
"step": 25
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5351290322580645,
"calib/avg_num_step_conf": 2.77734375,
"calib/ece": 0.23690196078431378,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.11764705882352941,
"calib/gap": 0.013729032258064544,
"calib/mean_conf": 0.8447450980392157,
"calib/mu_c": 0.8501290322580646,
"calib/mu_w": 0.8364,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23690196078431378,
"calib/std_conf": 0.05971692804874195,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7648410757946209,
"calib/step_q_c_n": 409.0,
"calib/step_q_gap": 0.0505033274502501,
"calib/step_q_w": 0.7143377483443708,
"calib/step_q_w_n": 302.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1222.0,
"completions/max_terminated_length": 1222.0,
"completions/mean_length": 433.0078125,
"completions/mean_terminated_length": 434.7059020996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.07723195105791092,
"kl": 0.06496047973632812,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0067,
"num_tokens": 5985537.0,
"reward": 0.3232446312904358,
"reward_std": 0.1632021963596344,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7019386291503906,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.37419939041137695,
"step": 26
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5066225165562914,
"calib/avg_num_step_conf": 3.1015625,
"calib/ece": 0.24322834645669295,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.07874015748031496,
"calib/gap": -0.0005748087185750705,
"calib/mean_conf": 0.8377165354330709,
"calib/mu_c": 0.8374834437086093,
"calib/mu_w": 0.8380582524271843,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.24322834645669295,
"calib/std_conf": 0.05801423903925538,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7444008264462809,
"calib/step_q_c_n": 484.0,
"calib/step_q_gap": 0.03998147160757126,
"calib/step_q_w": 0.7044193548387097,
"calib/step_q_w_n": 310.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2068.0,
"completions/max_terminated_length": 2068.0,
"completions/mean_length": 468.66015625,
"completions/mean_terminated_length": 468.66015625,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.0288,
"grad_norm": 0.006958463229238987,
"kl": 0.013710975646972656,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0335,
"num_tokens": 6210730.0,
"reward": 0.301924467086792,
"reward_std": 0.2039233148097992,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6822699308395386,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3924834728240967,
"step": 27
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5399909111565553,
"calib/avg_num_step_conf": 2.33984375,
"calib/ece": 0.18122950819672146,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.10245901639344263,
"calib/gap": 0.010517306672726212,
"calib/mean_conf": 0.842704918032787,
"calib/mu_c": 0.846196319018405,
"calib/mu_w": 0.8356790123456788,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.1779508196721313,
"calib/std_conf": 0.07354085387679224,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.7491463414634147,
"calib/step_q_c_n": 410.0,
"calib/step_q_gap": 0.016924119241192304,
"calib/step_q_w": 0.7322222222222224,
"calib/step_q_w_n": 189.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2981.0,
"completions/max_terminated_length": 2981.0,
"completions/mean_length": 521.23046875,
"completions/mean_terminated_length": 521.23046875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.006662163883447647,
"kl": 0.009778022766113281,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0248,
"num_tokens": 6451109.0,
"reward": 0.3436114192008972,
"reward_std": 0.1427958607673645,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6973953247070312,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": -0.32345378398895264,
"step": 28
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5013983739837398,
"calib/avg_num_step_conf": 3.16015625,
"calib/ece": 0.35213709677419347,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.1532258064516129,
"calib/gap": 0.0003726829268290155,
"calib/mean_conf": 0.8481048387096773,
"calib/mu_c": 0.8482926829268291,
"calib/mu_w": 0.8479200000000001,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.35213709677419347,
"calib/std_conf": 0.06029528395247401,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6996764705882352,
"calib/step_q_c_n": 340.0,
"calib/step_q_gap": 0.00660610811488771,
"calib/step_q_w": 0.6930703624733475,
"calib/step_q_w_n": 469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 540.2890625,
"completions/mean_terminated_length": 542.4078979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.006647651549428701,
"kl": 0.011612892150878906,
"learning_rate": 4.75e-06,
"loss": 0.0118,
"num_tokens": 6696551.0,
"reward": 0.25028783082962036,
"reward_std": 0.2021082192659378,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.5928003787994385,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.3789435029029846,
"step": 29
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.46759906759906755,
"calib/avg_num_step_conf": 3.375,
"calib/ece": 0.2636693548387095,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.10483870967741936,
"calib/gap": 0.010287712287712414,
"calib/mean_conf": 0.8322177419354838,
"calib/mu_c": 0.8365734265734266,
"calib/mu_w": 0.8262857142857142,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2596370967741934,
"calib/std_conf": 0.10654925110497106,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7221413721413722,
"calib/step_q_c_n": 481.0,
"calib/step_q_gap": 0.05825103271578458,
"calib/step_q_w": 0.6638903394255876,
"calib/step_q_w_n": 383.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2996.0,
"completions/max_terminated_length": 2996.0,
"completions/mean_length": 562.6953125,
"completions/mean_terminated_length": 569.3676147460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.032,
"grad_norm": 0.006563671864569187,
"kl": 0.010916709899902344,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0258,
"num_tokens": 6947585.0,
"reward": 0.28651654720306396,
"reward_std": 0.19402888417243958,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6579402685165405,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3888133466243744,
"step": 30
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5641335752200497,
"calib/avg_num_step_conf": 3.38671875,
"calib/ece": 0.3468852459016394,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.12295081967213115,
"calib/gap": 0.020538869851508545,
"calib/mean_conf": 0.8427868852459017,
"calib/mu_c": 0.8531404958677686,
"calib/mu_w": 0.8326016260162601,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3468852459016394,
"calib/std_conf": 0.07599894622955784,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6967750000000001,
"calib/step_q_c_n": 400.0,
"calib/step_q_gap": 0.028595128479657506,
"calib/step_q_w": 0.6681798715203426,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 563.71875,
"completions/mean_terminated_length": 574.9482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.006196488626301289,
"kl": 0.012213706970214844,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0209,
"num_tokens": 7197809.0,
"reward": 0.23471850156784058,
"reward_std": 0.1854703426361084,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.59673011302948,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.41088682413101196,
"step": 31
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5799528001020539,
"calib/avg_num_step_conf": 2.91796875,
"calib/ece": 0.30733067729083663,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.11155378486055777,
"calib/gap": 0.028174512055109346,
"calib/mean_conf": 0.8411952191235059,
"calib/mu_c": 0.8543283582089554,
"calib/mu_w": 0.826153846153846,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.30733067729083663,
"calib/std_conf": 0.08186670329637882,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7777472527472528,
"calib/step_q_c_n": 364.0,
"calib/step_q_gap": 0.07980991593263154,
"calib/step_q_w": 0.6979373368146212,
"calib/step_q_w_n": 383.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2818.0,
"completions/max_terminated_length": 2818.0,
"completions/mean_length": 516.00390625,
"completions/mean_terminated_length": 524.1944580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.007397293113172054,
"kl": 0.017431259155273438,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0133,
"num_tokens": 7436610.0,
"reward": 0.2804139256477356,
"reward_std": 0.17597326636314392,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6451066732406616,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.38271623849868774,
"step": 32
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47163318452380953,
"calib/avg_num_step_conf": 3.24609375,
"calib/ece": 0.3450393700787403,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.1220472440944882,
"calib/gap": -0.005338541666666585,
"calib/mean_conf": 0.8489763779527559,
"calib/mu_c": 0.846328125,
"calib/mu_w": 0.8516666666666666,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3450393700787403,
"calib/std_conf": 0.05202716803413796,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7498271604938274,
"calib/step_q_c_n": 405.0,
"calib/step_q_gap": 0.0334421839680058,
"calib/step_q_w": 0.7163849765258216,
"calib/step_q_w_n": 426.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1187.0,
"completions/max_terminated_length": 1187.0,
"completions/mean_length": 493.765625,
"completions/mean_terminated_length": 497.6535339355469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0352,
"grad_norm": 0.006877983920276165,
"kl": 0.01480865478515625,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0006,
"num_tokens": 7669886.0,
"reward": 0.2519468367099762,
"reward_std": 0.20320287346839905,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.619293749332428,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.41305631399154663,
"step": 33
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5368427868427869,
"calib/avg_num_step_conf": 3.515625,
"calib/ece": 0.2832270916334661,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.11553784860557768,
"calib/gap": 0.014276094276094331,
"calib/mean_conf": 0.8529482071713148,
"calib/mu_c": 0.8590909090909091,
"calib/mu_w": 0.8448148148148148,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2832270916334661,
"calib/std_conf": 0.057657400249739615,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7596969696969698,
"calib/step_q_c_n": 462.0,
"calib/step_q_gap": 0.06031340805313423,
"calib/step_q_w": 0.6993835616438355,
"calib/step_q_w_n": 438.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2568.0,
"completions/max_terminated_length": 2568.0,
"completions/mean_length": 483.65234375,
"completions/mean_terminated_length": 487.46063232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.07357311248779297,
"kl": 0.1785106658935547,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0134,
"num_tokens": 7898813.0,
"reward": 0.3066682517528534,
"reward_std": 0.2058257907629013,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6612108945846558,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.35490572452545166,
"step": 34
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5123316197045618,
"calib/avg_num_step_conf": 2.67578125,
"calib/ece": 0.32262096774193544,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.07661290322580645,
"calib/gap": 0.01582937463395584,
"calib/mean_conf": 0.8347177419354839,
"calib/mu_c": 0.8424409448818898,
"calib/mu_w": 0.826611570247934,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.32262096774193544,
"calib/std_conf": 0.07561522509302532,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7379228486646885,
"calib/step_q_c_n": 337.0,
"calib/step_q_gap": 0.03481940038882636,
"calib/step_q_w": 0.7031034482758621,
"calib/step_q_w_n": 348.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 510.4453125,
"completions/mean_terminated_length": 516.498046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.006531782913953066,
"kl": 0.016582489013671875,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0256,
"num_tokens": 8138743.0,
"reward": 0.24612952768802643,
"reward_std": 0.2067875862121582,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6203484535217285,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.42027685046195984,
"step": 35
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6350965250965253,
"calib/avg_num_step_conf": 3.0703125,
"calib/ece": 0.12921568627450977,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.13333333333333333,
"calib/gap": 0.03343629343629351,
"calib/mean_conf": 0.8496862745098038,
"calib/mu_c": 0.8588648648648648,
"calib/mu_w": 0.8254285714285713,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12670588235294114,
"calib/std_conf": 0.07523492362712644,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7731095406360424,
"calib/step_q_c_n": 566.0,
"calib/step_q_gap": 0.06288226790876972,
"calib/step_q_w": 0.7102272727272727,
"calib/step_q_w_n": 220.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2080.0,
"completions/max_terminated_length": 2080.0,
"completions/mean_length": 478.0546875,
"completions/mean_terminated_length": 479.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.0384,
"grad_norm": 0.007612496614456177,
"kl": 0.021648406982421875,
"learning_rate": 4.555555555555556e-06,
"loss": 0.025,
"num_tokens": 8363837.0,
"reward": 0.41307783126831055,
"reward_std": 0.17080523073673248,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.787934422492981,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.30474740266799927,
"step": 36
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6120358090185677,
"calib/avg_num_step_conf": 3.1328125,
"calib/ece": 0.36178861788617883,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.08536585365853659,
"calib/gap": 0.0403474801061009,
"calib/mean_conf": 0.8333333333333334,
"calib/mu_c": 0.8546551724137932,
"calib/mu_w": 0.8143076923076923,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.36178861788617883,
"calib/std_conf": 0.10269274296655329,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.78,
"calib/step_q_c_n": 342.0,
"calib/step_q_gap": 0.17943478260869572,
"calib/step_q_w": 0.6005652173913043,
"calib/step_q_w_n": 460.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 560.125,
"completions/mean_terminated_length": 562.3215942382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.006658955942839384,
"kl": 0.017736434936523438,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0064,
"num_tokens": 8614325.0,
"reward": 0.25001418590545654,
"reward_std": 0.1737680435180664,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6048921942710876,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3876763582229614,
"step": 37
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5477272727272727,
"calib/avg_num_step_conf": 2.8203125,
"calib/ece": 0.29455645161290317,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.1693548387096774,
"calib/gap": 0.019081686429512468,
"calib/mean_conf": 0.8390725806451613,
"calib/mu_c": 0.847536231884058,
"calib/mu_w": 0.8284545454545456,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2885887096774193,
"calib/std_conf": 0.11037850373550395,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7636613272311213,
"calib/step_q_c_n": 437.0,
"calib/step_q_gap": 0.054924485125858125,
"calib/step_q_w": 0.7087368421052632,
"calib/step_q_w_n": 285.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 496.72265625,
"completions/mean_terminated_length": 502.6126708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.007322958204895258,
"kl": 0.028581619262695312,
"learning_rate": 4.5e-06,
"loss": 0.0158,
"num_tokens": 8848374.0,
"reward": 0.2834533452987671,
"reward_std": 0.18513981997966766,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6495933532714844,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3850303888320923,
"step": 38
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6027422990232908,
"calib/avg_num_step_conf": 2.62109375,
"calib/ece": 0.36640316205533596,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.19367588932806323,
"calib/gap": 0.04174931129476589,
"calib/mean_conf": 0.8392094861660079,
"calib/mu_c": 0.86099173553719,
"calib/mu_w": 0.8192424242424241,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3636758893280632,
"calib/std_conf": 0.12069695329426747,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7932542372881356,
"calib/step_q_c_n": 295.0,
"calib/step_q_gap": 0.07586062026685902,
"calib/step_q_w": 0.7173936170212766,
"calib/step_q_w_n": 376.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1922.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 501.16015625,
"completions/mean_terminated_length": 501.16015625,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.0416,
"grad_norm": 0.006672346033155918,
"kl": 0.020376205444335938,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0053,
"num_tokens": 9082759.0,
"reward": 0.2530670166015625,
"reward_std": 0.17842620611190796,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6115871071815491,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.39607805013656616,
"step": 39
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5051023622047244,
"calib/avg_num_step_conf": 2.37109375,
"calib/ece": 0.3546031746031745,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.15476190476190477,
"calib/gap": 0.014383622047244082,
"calib/mean_conf": 0.8461111111111111,
"calib/mu_c": 0.85336,
"calib/mu_w": 0.8389763779527559,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3523412698412697,
"calib/std_conf": 0.08938921690430807,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.76053125,
"calib/step_q_c_n": 320.0,
"calib/step_q_gap": -0.016855509581881534,
"calib/step_q_w": 0.7773867595818815,
"calib/step_q_w_n": 287.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2447.0,
"completions/max_terminated_length": 2447.0,
"completions/mean_length": 512.81640625,
"completions/mean_terminated_length": 514.8274536132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.006264918949455023,
"kl": 0.024440765380859375,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0076,
"num_tokens": 9320800.0,
"reward": 0.23327161371707916,
"reward_std": 0.20680826902389526,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6144461035728455,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.44087162613868713,
"step": 40
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4858774267224972,
"calib/avg_num_step_conf": 2.4375,
"calib/ece": 0.14285156249999992,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.27734375,
"calib/gap": 0.027073467834031195,
"calib/mean_conf": 0.8530859375,
"calib/mu_c": 0.8605945945945945,
"calib/mu_w": 0.8335211267605633,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1366406249999999,
"calib/std_conf": 0.11130374596008032,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7796583143507972,
"calib/step_q_c_n": 439.0,
"calib/step_q_gap": 0.07452317921566198,
"calib/step_q_w": 0.7051351351351353,
"calib/step_q_w_n": 185.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1122.0,
"completions/max_terminated_length": 1122.0,
"completions/mean_length": 444.66796875,
"completions/mean_terminated_length": 446.41180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.006443468388170004,
"kl": 0.027767181396484375,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0064,
"num_tokens": 9541883.0,
"reward": 0.4025619328022003,
"reward_std": 0.21940746903419495,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7732605338096619,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3095429241657257,
"step": 41
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5870038412291932,
"calib/avg_num_step_conf": 2.6875,
"calib/ece": 0.30221428571428577,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": 0.020968245838668298,
"calib/mean_conf": 0.8657063492063491,
"calib/mu_c": 0.8748591549295774,
"calib/mu_w": 0.8538909090909091,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30221428571428577,
"calib/std_conf": 0.06530873777320191,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7882065217391303,
"calib/step_q_c_n": 368.0,
"calib/step_q_gap": 0.03461902173913034,
"calib/step_q_w": 0.7535875,
"calib/step_q_w_n": 320.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1991.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 417.30078125,
"completions/mean_terminated_length": 420.58660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0448,
"grad_norm": 0.006725949700921774,
"kl": 0.030731201171875,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0069,
"num_tokens": 9753080.0,
"reward": 0.2923615574836731,
"reward_std": 0.1730237454175949,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6582983732223511,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3813877999782562,
"step": 42
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6098401143897049,
"calib/avg_num_step_conf": 2.5,
"calib/ece": 0.22211764705882345,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.20784313725490197,
"calib/gap": 0.05029702326790575,
"calib/mean_conf": 0.8378039215686275,
"calib/mu_c": 0.8571337579617834,
"calib/mu_w": 0.8068367346938776,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22211764705882345,
"calib/std_conf": 0.11507150711330402,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7522802850356294,
"calib/step_q_c_n": 421.0,
"calib/step_q_gap": -0.003747112224644522,
"calib/step_q_w": 0.7560273972602739,
"calib/step_q_w_n": 219.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1255.0,
"completions/max_terminated_length": 1255.0,
"completions/mean_length": 475.69140625,
"completions/mean_terminated_length": 477.556884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.006733867339789867,
"kl": 0.025842666625976562,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0272,
"num_tokens": 9980081.0,
"reward": 0.3314400315284729,
"reward_std": 0.22022783756256104,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7135285139083862,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.370179682970047,
"step": 43
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5617777777777777,
"calib/avg_num_step_conf": 2.71484375,
"calib/ece": 0.36709163346613544,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.2788844621513944,
"calib/gap": 0.026170793650793667,
"calib/mean_conf": 0.855617529880478,
"calib/mu_c": 0.8686507936507937,
"calib/mu_w": 0.84248,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3603585657370518,
"calib/std_conf": 0.11360410889243894,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7515193370165746,
"calib/step_q_c_n": 362.0,
"calib/step_q_gap": 0.06743525293249042,
"calib/step_q_w": 0.6840840840840842,
"calib/step_q_w_n": 333.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2917.0,
"completions/max_terminated_length": 2917.0,
"completions/mean_length": 513.328125,
"completions/mean_terminated_length": 519.4150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.006369642447680235,
"kl": 0.026624679565429688,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0139,
"num_tokens": 10217813.0,
"reward": 0.24526745080947876,
"reward_std": 0.20343050360679626,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6108214855194092,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.4140365719795227,
"step": 44
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5850052110474205,
"calib/avg_num_step_conf": 2.69921875,
"calib/ece": 0.23276679841897235,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.19367588932806323,
"calib/gap": 0.08407959874934867,
"calib/mean_conf": 0.8303162055335969,
"calib/mu_c": 0.8638815789473684,
"calib/mu_w": 0.7798019801980197,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23114624505928857,
"calib/std_conf": 0.16293715541536571,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8120765027322405,
"calib/step_q_c_n": 366.0,
"calib/step_q_gap": 0.13967650273224053,
"calib/step_q_w": 0.6724,
"calib/step_q_w_n": 325.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1983.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 484.37109375,
"completions/mean_terminated_length": 486.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.048,
"grad_norm": 0.006604231894016266,
"kl": 0.028827667236328125,
"learning_rate": 4.305555555555556e-06,
"loss": 0.0122,
"num_tokens": 10446860.0,
"reward": 0.3432024121284485,
"reward_std": 0.18877151608467102,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7128074169158936,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3428088426589966,
"step": 45
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.622137661637931,
"calib/avg_num_step_conf": 2.48828125,
"calib/ece": 0.3403688524590163,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.27049180327868855,
"calib/gap": 0.02542025862068953,
"calib/mean_conf": 0.8513524590163934,
"calib/mu_c": 0.8634375,
"calib/mu_w": 0.8380172413793104,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.33356557377049173,
"calib/std_conf": 0.11088418283123143,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7774074074074074,
"calib/step_q_c_n": 324.0,
"calib/step_q_gap": 0.04565021890900489,
"calib/step_q_w": 0.7317571884984025,
"calib/step_q_w_n": 313.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 520.828125,
"completions/mean_terminated_length": 524.9291381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.006163495592772961,
"kl": 0.029521942138671875,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0366,
"num_tokens": 10684960.0,
"reward": 0.25145965814590454,
"reward_std": 0.20970991253852844,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6090492606163025,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.39519235491752625,
"step": 46
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5917415577342048,
"calib/avg_num_step_conf": 2.20703125,
"calib/ece": 0.24120481927710852,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3132530120481928,
"calib/gap": 0.0656372549019607,
"calib/mean_conf": 0.8511646586345382,
"calib/mu_c": 0.8764705882352941,
"calib/mu_w": 0.8108333333333334,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.23895582329317278,
"calib/std_conf": 0.13462072632327893,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.7662576687116565,
"calib/step_q_c_n": 326.0,
"calib/step_q_gap": 0.09291038837692855,
"calib/step_q_w": 0.6733472803347279,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 505.50390625,
"completions/mean_terminated_length": 505.50390625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.006543243769556284,
"kl": 0.030582427978515625,
"learning_rate": 4.25e-06,
"loss": 0.0041,
"num_tokens": 10920345.0,
"reward": 0.3438766896724701,
"reward_std": 0.15386797487735748,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6899590492248535,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3115805983543396,
"step": 47
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5662878787878788,
"calib/avg_num_step_conf": 2.53125,
"calib/ece": 0.39715999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.292,
"calib/gap": 0.007679763739085632,
"calib/mean_conf": 0.84764,
"calib/mu_c": 0.8516949152542371,
"calib/mu_w": 0.8440151515151515,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.38639999999999997,
"calib/std_conf": 0.12591914230965837,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7769932432432433,
"calib/step_q_c_n": 296.0,
"calib/step_q_gap": 0.057533015970515855,
"calib/step_q_w": 0.7194602272727274,
"calib/step_q_w_n": 352.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2900.0,
"completions/max_terminated_length": 2900.0,
"completions/mean_length": 480.625,
"completions/mean_terminated_length": 484.4094543457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.0512,
"grad_norm": 0.007595698349177837,
"kl": 0.03356170654296875,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0104,
"num_tokens": 11147073.0,
"reward": 0.22138309478759766,
"reward_std": 0.23210257291793823,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5816078186035156,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.4239978492259979,
"step": 48
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5904684450227716,
"calib/avg_num_step_conf": 2.63671875,
"calib/ece": 0.29737051792828684,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.33067729083665337,
"calib/gap": 0.0371346779440469,
"calib/mean_conf": 0.8674900398406374,
"calib/mu_c": 0.8831724137931035,
"calib/mu_w": 0.8460377358490566,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2935856573705179,
"calib/std_conf": 0.10301275836929619,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7889066666666666,
"calib/step_q_c_n": 375.0,
"calib/step_q_gap": 0.05550666666666659,
"calib/step_q_w": 0.7334,
"calib/step_q_w_n": 300.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 474.73828125,
"completions/mean_terminated_length": 476.60003662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.007054316811263561,
"kl": 0.032970428466796875,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0348,
"num_tokens": 11373142.0,
"reward": 0.30179956555366516,
"reward_std": 0.1905430108308792,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.657378077507019,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.3600289821624756,
"step": 49
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5599271012006861,
"calib/avg_num_step_conf": 2.3203125,
"calib/ece": 0.24437246963562745,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.3360323886639676,
"calib/gap": 0.021552315608919348,
"calib/mean_conf": 0.88251012145749,
"calib/mu_c": 0.890188679245283,
"calib/mu_w": 0.8686363636363637,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.24157894736842098,
"calib/std_conf": 0.07987955824040092,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.8135911602209945,
"calib/step_q_c_n": 362.0,
"calib/step_q_gap": 0.0036342636692704433,
"calib/step_q_w": 0.8099568965517241,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2032.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 500.54296875,
"completions/mean_terminated_length": 504.4842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.006964292377233505,
"kl": 0.030660629272460938,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0509,
"num_tokens": 11606641.0,
"reward": 0.3266167640686035,
"reward_std": 0.21096104383468628,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6699769496917725,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.32924342155456543,
"step": 50
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5441904761904762,
"calib/avg_num_step_conf": 2.4921875,
"calib/ece": 0.2988235294117647,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.43137254901960786,
"calib/gap": 0.03331428571428563,
"calib/mean_conf": 0.8698823529411764,
"calib/mu_c": 0.8835999999999999,
"calib/mu_w": 0.8502857142857143,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.29023529411764704,
"calib/std_conf": 0.13659868903266995,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8239673913043479,
"calib/step_q_c_n": 368.0,
"calib/step_q_gap": 0.0506340579710145,
"calib/step_q_w": 0.7733333333333334,
"calib/step_q_w_n": 270.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 480.13671875,
"completions/mean_terminated_length": 480.13671875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.0544,
"grad_norm": 0.007857617922127247,
"kl": 0.0311737060546875,
"learning_rate": 4.138888888888889e-06,
"loss": 0.021,
"num_tokens": 11838852.0,
"reward": 0.3062852919101715,
"reward_std": 0.232842355966568,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.67255699634552,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.37561148405075073,
"step": 51
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6458100558659218,
"calib/avg_num_step_conf": 2.4453125,
"calib/ece": 0.13555118110236228,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.35826771653543305,
"calib/gap": 0.13500111731843578,
"calib/mean_conf": 0.8263385826771653,
"calib/mu_c": 0.8662011173184359,
"calib/mu_w": 0.7312000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1285826771653544,
"calib/std_conf": 0.18965047504450225,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.800990099009901,
"calib/step_q_c_n": 404.0,
"calib/step_q_gap": 0.18676036928017137,
"calib/step_q_w": 0.6142297297297297,
"calib/step_q_w_n": 222.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2595.0,
"completions/max_terminated_length": 2595.0,
"completions/mean_length": 473.12109375,
"completions/mean_terminated_length": 473.12109375,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.0077812341041862965,
"kl": 0.03348541259765625,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0046,
"num_tokens": 12067923.0,
"reward": 0.41235560178756714,
"reward_std": 0.2180776596069336,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7757226228713989,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.286167711019516,
"step": 52
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6079401925308937,
"calib/avg_num_step_conf": 2.609375,
"calib/ece": 0.23512096774193553,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3346774193548387,
"calib/gap": 0.07823649894176288,
"calib/mean_conf": 0.8342338709677419,
"calib/mu_c": 0.8648344370860928,
"calib/mu_w": 0.7865979381443299,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.230241935483871,
"calib/std_conf": 0.1661525537294629,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8075830815709968,
"calib/step_q_c_n": 331.0,
"calib/step_q_gap": 0.21452670174903832,
"calib/step_q_w": 0.5930563798219585,
"calib/step_q_w_n": 337.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 509.87109375,
"completions/mean_terminated_length": 515.9169921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.007624692749232054,
"kl": 0.034870147705078125,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0483,
"num_tokens": 12304274.0,
"reward": 0.34297817945480347,
"reward_std": 0.19570030272006989,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6943316459655762,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.31931272149086,
"step": 53
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7167325428194995,
"calib/avg_num_step_conf": 2.3203125,
"calib/ece": 0.1405859375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.4375,
"calib/gap": 0.12296597690459599,
"calib/mean_conf": 0.8660546874999999,
"calib/mu_c": 0.8991978609625669,
"calib/mu_w": 0.7762318840579709,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1380859375,
"calib/std_conf": 0.13993462807424523,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8436574074074074,
"calib/step_q_c_n": 432.0,
"calib/step_q_gap": 0.10569444444444431,
"calib/step_q_w": 0.737962962962963,
"calib/step_q_w_n": 162.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1246.0,
"completions/max_terminated_length": 1246.0,
"completions/mean_length": 410.34375,
"completions/mean_terminated_length": 411.9529724121094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.0576,
"grad_norm": 0.013864198699593544,
"kl": 0.08069229125976562,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0006,
"num_tokens": 12515554.0,
"reward": 0.4472993016242981,
"reward_std": 0.17225967347621918,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.8135707378387451,
"rewards/format_reward_step": 1.0,
"rewards/step_l1_reward": -0.26506587862968445,
"step": 54
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7380197768762677,
"calib/avg_num_step_conf": 2.16015625,
"calib/ece": 0.3099603174603175,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.49603174603174605,
"calib/gap": 0.14150354969574064,
"calib/mean_conf": 0.849642857142857,
"calib/mu_c": 0.9147794117647059,
"calib/mu_w": 0.7732758620689653,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3099603174603175,
"calib/std_conf": 0.1763346819772764,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8392075471698113,
"calib/step_q_c_n": 265.0,
"calib/step_q_gap": 0.1505964360587002,
"calib/step_q_w": 0.6886111111111111,
"calib/step_q_w_n": 288.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 448.0078125,
"completions/mean_terminated_length": 453.3201904296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.007659987546503544,
"kl": 0.03458213806152344,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0013,
"num_tokens": 12738068.0,
"reward": 0.3213605284690857,
"reward_std": 0.23816221952438354,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6774871349334717,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3347660303115845,
"step": 55
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6091312533620226,
"calib/avg_num_step_conf": 2.03515625,
"calib/ece": 0.2891902834008097,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4291497975708502,
"calib/gap": 0.05400349650349667,
"calib/mean_conf": 0.8345344129554656,
"calib/mu_c": 0.8572727272727274,
"calib/mu_w": 0.8032692307692307,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.2723886639676113,
"calib/std_conf": 0.18332203758283339,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7831118881118881,
"calib/step_q_c_n": 286.0,
"calib/step_q_gap": 0.06847359023954769,
"calib/step_q_w": 0.7146382978723405,
"calib/step_q_w_n": 235.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2926.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 477.89453125,
"completions/mean_terminated_length": 479.7686462402344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.007571075111627579,
"kl": 0.034942626953125,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0369,
"num_tokens": 12967249.0,
"reward": 0.322052001953125,
"reward_std": 0.19588708877563477,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6582722663879395,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.3172932267189026,
"step": 56
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6391849529780563,
"calib/avg_num_step_conf": 1.97265625,
"calib/ece": 0.2246428571428572,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4801587301587302,
"calib/gap": 0.07466875653082572,
"calib/mean_conf": 0.8767063492063492,
"calib/mu_c": 0.9024848484848487,
"calib/mu_w": 0.827816091954023,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.22329365079365082,
"calib/std_conf": 0.13108167618970798,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.796345029239766,
"calib/step_q_c_n": 342.0,
"calib/step_q_gap": 0.053768342123201696,
"calib/step_q_w": 0.7425766871165643,
"calib/step_q_w_n": 163.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 454.72265625,
"completions/mean_terminated_length": 456.50592041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.0608,
"grad_norm": 0.00668899342417717,
"kl": 0.035167694091796875,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0248,
"num_tokens": 13190450.0,
"reward": 0.36799943447113037,
"reward_std": 0.23135200142860413,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7266637086868286,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3148835003376007,
"step": 57
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.573910719059075,
"calib/avg_num_step_conf": 2.69921875,
"calib/ece": 0.33363265306122447,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5265306122448979,
"calib/gap": 0.056478882651697426,
"calib/mean_conf": 0.8477551020408163,
"calib/mu_c": 0.8744961240310077,
"calib/mu_w": 0.8180172413793103,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.3274285714285714,
"calib/std_conf": 0.18628555330977323,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6834943181818182,
"calib/step_q_c_n": 352.0,
"calib/step_q_gap": 0.05476275475998926,
"calib/step_q_w": 0.6287315634218289,
"calib/step_q_w_n": 339.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 575.109375,
"completions/mean_terminated_length": 575.109375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.0061858044937253,
"kl": 0.03353118896484375,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0023,
"num_tokens": 13443998.0,
"reward": 0.26048970222473145,
"reward_std": 0.2409365326166153,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.609772264957428,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.3801991641521454,
"step": 58
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5171932495876157,
"calib/avg_num_step_conf": 1.828125,
"calib/ece": 0.2977470355731226,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.541501976284585,
"calib/gap": 0.04947151376728853,
"calib/mean_conf": 0.844703557312253,
"calib/mu_c": 0.8664084507042255,
"calib/mu_w": 0.8169369369369369,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.2905928853754941,
"calib/std_conf": 0.1965684139701614,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7663235294117647,
"calib/step_q_c_n": 272.0,
"calib/step_q_gap": 0.009486794717887048,
"calib/step_q_w": 0.7568367346938777,
"calib/step_q_w_n": 196.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2300.0,
"completions/max_terminated_length": 2300.0,
"completions/mean_length": 478.53125,
"completions/mean_terminated_length": 480.4078674316406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.006281663663685322,
"kl": 0.037445068359375,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0161,
"num_tokens": 13672750.0,
"reward": 0.2808471918106079,
"reward_std": 0.2642785310745239,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6390796899795532,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.38285407423973083,
"step": 59
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5813658796852074,
"calib/avg_num_step_conf": 1.953125,
"calib/ece": 0.272570281124498,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5020080321285141,
"calib/gap": 0.06716086434573842,
"calib/mean_conf": 0.8275903614457831,
"calib/mu_c": 0.8551020408163266,
"calib/mu_w": 0.7879411764705881,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.2548995983935743,
"calib/std_conf": 0.2103253763176534,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.77,
"calib/step_q_c_n": 290.0,
"calib/step_q_gap": 0.09714285714285709,
"calib/step_q_w": 0.6728571428571429,
"calib/step_q_w_n": 210.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2951.0,
"completions/max_terminated_length": 2951.0,
"completions/mean_length": 487.67578125,
"completions/mean_terminated_length": 487.67578125,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.064,
"grad_norm": 0.006782458629459143,
"kl": 0.03938865661621094,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0381,
"num_tokens": 13906451.0,
"reward": 0.3251803517341614,
"reward_std": 0.2544691264629364,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6601210832595825,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.3152291178703308,
"step": 60
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5899904083310497,
"calib/avg_num_step_conf": 2.1328125,
"calib/ece": 0.24699604743083,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5612648221343873,
"calib/gap": 0.06016237325294593,
"calib/mean_conf": 0.8630434782608696,
"calib/mu_c": 0.8842073170731707,
"calib/mu_w": 0.8240449438202248,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23090909090909087,
"calib/std_conf": 0.17829402950040316,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7908571428571429,
"calib/step_q_c_n": 315.0,
"calib/step_q_gap": 0.1303809523809525,
"calib/step_q_w": 0.6604761904761904,
"calib/step_q_w_n": 231.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3000.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 427.28515625,
"completions/mean_terminated_length": 427.28515625,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.006563232745975256,
"kl": 0.04006195068359375,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0374,
"num_tokens": 14119900.0,
"reward": 0.3624458909034729,
"reward_std": 0.22378845512866974,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7091094255447388,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.309217631816864,
"step": 61
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49980106100795746,
"calib/avg_num_step_conf": 1.94921875,
"calib/ece": 0.30947791164658645,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5341365461847389,
"calib/gap": 0.02925265251989395,
"calib/mean_conf": 0.8130923694779115,
"calib/mu_c": 0.8253103448275861,
"calib/mu_w": 0.7960576923076922,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.270120481927711,
"calib/std_conf": 0.23478879576435055,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7263306451612904,
"calib/step_q_c_n": 248.0,
"calib/step_q_gap": 0.14445813520113104,
"calib/step_q_w": 0.5818725099601594,
"calib/step_q_w_n": 251.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2698.0,
"completions/max_terminated_length": 2698.0,
"completions/mean_length": 479.015625,
"completions/mean_terminated_length": 480.8941345214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.005922415293753147,
"kl": 0.038631439208984375,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0266,
"num_tokens": 14349608.0,
"reward": 0.2788121998310089,
"reward_std": 0.2546740770339966,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6321789026260376,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.3792419731616974,
"step": 62
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.669293903074518,
"calib/avg_num_step_conf": 1.74609375,
"calib/ece": 0.20695652173913032,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.45454545454545453,
"calib/gap": 0.17424895779051597,
"calib/mean_conf": 0.78300395256917,
"calib/mu_c": 0.8525657894736842,
"calib/mu_w": 0.6783168316831683,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.19458498023715404,
"calib/std_conf": 0.25197807950999856,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.810561797752809,
"calib/step_q_c_n": 267.0,
"calib/step_q_gap": 0.2016729088639202,
"calib/step_q_w": 0.6088888888888888,
"calib/step_q_w_n": 180.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1708.0,
"completions/max_terminated_length": 1708.0,
"completions/mean_length": 493.328125,
"completions/mean_terminated_length": 495.2627868652344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.0672,
"grad_norm": 0.0066199833527207375,
"kl": 0.03546905517578125,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0361,
"num_tokens": 14584540.0,
"reward": 0.3798148036003113,
"reward_std": 0.22585511207580566,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7342566251754761,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.2894707918167114,
"step": 63
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5892217630853994,
"calib/avg_num_step_conf": 1.8359375,
"calib/ece": 0.2399999999999999,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3557312252964427,
"calib/gap": 0.04540151515151514,
"calib/mean_conf": 0.7369960474308299,
"calib/mu_c": 0.7527878787878788,
"calib/mu_w": 0.7073863636363636,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.16241106719367582,
"calib/std_conf": 0.2602956436799597,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7380622837370242,
"calib/step_q_c_n": 289.0,
"calib/step_q_gap": 0.06750979754917885,
"calib/step_q_w": 0.6705524861878454,
"calib/step_q_w_n": 181.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 459.734375,
"completions/mean_terminated_length": 461.53729248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.006584564223885536,
"kl": 0.0449371337890625,
"learning_rate": 3.777777777777778e-06,
"loss": 0.033,
"num_tokens": 14806008.0,
"reward": 0.34331193566322327,
"reward_std": 0.22489362955093384,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7015784978866577,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.33917340636253357,
"step": 64
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6740588421385638,
"calib/avg_num_step_conf": 1.97265625,
"calib/ece": 0.27251968503937,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.41338582677165353,
"calib/gap": 0.10797595697564055,
"calib/mean_conf": 0.8211811023622048,
"calib/mu_c": 0.8675172413793103,
"calib/mu_w": 0.7595412844036697,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.26141732283464564,
"calib/std_conf": 0.19436192464861915,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8445418326693228,
"calib/step_q_c_n": 251.0,
"calib/step_q_gap": 0.12745521849609442,
"calib/step_q_w": 0.7170866141732284,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1166.0,
"completions/max_terminated_length": 1166.0,
"completions/mean_length": 392.15234375,
"completions/mean_terminated_length": 393.6902160644531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.0067816670052707195,
"kl": 0.055877685546875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0072,
"num_tokens": 15011423.0,
"reward": 0.32644250988960266,
"reward_std": 0.17547816038131714,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6920089721679688,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.34849900007247925,
"step": 65
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6454637096774194,
"calib/avg_num_step_conf": 2.078125,
"calib/ece": 0.3492213114754099,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.4180327868852459,
"calib/gap": 0.0714999999999999,
"calib/mean_conf": 0.8076639344262295,
"calib/mu_c": 0.844,
"calib/mu_w": 0.7725000000000001,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3325409836065575,
"calib/std_conf": 0.20432183351295097,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7880275229357798,
"calib/step_q_c_n": 218.0,
"calib/step_q_gap": 0.15923771401858233,
"calib/step_q_w": 0.6287898089171975,
"calib/step_q_w_n": 314.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2553.0,
"completions/max_terminated_length": 2553.0,
"completions/mean_length": 500.94921875,
"completions/mean_terminated_length": 512.9720458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.0704,
"grad_norm": 0.006632333155721426,
"kl": 0.05127716064453125,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0107,
"num_tokens": 15246018.0,
"reward": 0.2660596966743469,
"reward_std": 0.21161139011383057,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6064273118972778,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.35633915662765503,
"step": 66
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7607763023493359,
"calib/avg_num_step_conf": 1.8359375,
"calib/ece": 0.2023228346456694,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4251968503937008,
"calib/gap": 0.13877494041539007,
"calib/mean_conf": 0.8276771653543307,
"calib/mu_c": 0.8763030303030305,
"calib/mu_w": 0.7375280898876404,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19019685039370088,
"calib/std_conf": 0.18565620508470213,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.858125,
"calib/step_q_c_n": 288.0,
"calib/step_q_gap": 0.1948282967032967,
"calib/step_q_w": 0.6632967032967033,
"calib/step_q_w_n": 182.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1531.0,
"completions/max_terminated_length": 1531.0,
"completions/mean_length": 442.3515625,
"completions/mean_terminated_length": 444.0863037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.007070634979754686,
"kl": 0.04443359375,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0114,
"num_tokens": 15464268.0,
"reward": 0.40109509229660034,
"reward_std": 0.18799439072608948,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7620258331298828,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.28639817237854004,
"step": 67
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6540259740259741,
"calib/avg_num_step_conf": 2.21484375,
"calib/ece": 0.2645600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.472,
"calib/gap": 0.11399350649350659,
"calib/mean_conf": 0.8212,
"calib/mu_c": 0.8713571428571429,
"calib/mu_w": 0.7573636363636363,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.26288000000000006,
"calib/std_conf": 0.20073903456976172,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8311111111111111,
"calib/step_q_c_n": 252.0,
"calib/step_q_gap": 0.22873015873015878,
"calib/step_q_w": 0.6023809523809524,
"calib/step_q_w_n": 315.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2498.0,
"completions/max_terminated_length": 2498.0,
"completions/mean_length": 439.55078125,
"completions/mean_terminated_length": 441.2745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.007071278523653746,
"kl": 0.042934417724609375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0096,
"num_tokens": 15680881.0,
"reward": 0.33351147174835205,
"reward_std": 0.19385266304016113,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6848187446594238,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3224833011627197,
"step": 68
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6231396534148828,
"calib/avg_num_step_conf": 1.96875,
"calib/ece": 0.2413524590163934,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.3237704918032787,
"calib/gap": 0.0887210329595649,
"calib/mean_conf": 0.7649590163934428,
"calib/mu_c": 0.8045925925925925,
"calib/mu_w": 0.7158715596330276,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.22651639344262292,
"calib/std_conf": 0.2210509509378051,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.7756903765690376,
"calib/step_q_c_n": 239.0,
"calib/step_q_gap": 0.13474698034262256,
"calib/step_q_w": 0.640943396226415,
"calib/step_q_w_n": 265.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2564.0,
"completions/max_terminated_length": 2564.0,
"completions/mean_length": 544.4296875,
"completions/mean_terminated_length": 557.4960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.0736,
"grad_norm": 0.005974503234028816,
"kl": 0.03606414794921875,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0062,
"num_tokens": 15924751.0,
"reward": 0.31283414363861084,
"reward_std": 0.23490868508815765,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6655257940292358,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.334388792514801,
"step": 69
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7731869014933369,
"calib/avg_num_step_conf": 1.828125,
"calib/ece": 0.19134146341463423,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.36585365853658536,
"calib/gap": 0.26039576776267326,
"calib/mean_conf": 0.7427235772357723,
"calib/mu_c": 0.8581021897810219,
"calib/mu_w": 0.5977064220183487,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1885772357723578,
"calib/std_conf": 0.27380542616123127,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8553478260869566,
"calib/step_q_c_n": 230.0,
"calib/step_q_gap": 0.2660621118012423,
"calib/step_q_w": 0.5892857142857143,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2901.0,
"completions/max_terminated_length": 2901.0,
"completions/mean_length": 545.76953125,
"completions/mean_terminated_length": 547.9098510742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.027560407295823097,
"kl": 0.07869338989257812,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.035,
"num_tokens": 16171460.0,
"reward": 0.37544897198677063,
"reward_std": 0.20757484436035156,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7419331669807434,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.2894727289676666,
"step": 70
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6098376623376623,
"calib/avg_num_step_conf": 1.66796875,
"calib/ece": 0.2484251968503938,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4330708661417323,
"calib/gap": 0.0649558441558441,
"calib/mean_conf": 0.7885826771653544,
"calib/mu_c": 0.8141558441558441,
"calib/mu_w": 0.7492,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.2153543307086615,
"calib/std_conf": 0.23665724711765163,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.803585657370518,
"calib/step_q_c_n": 251.0,
"calib/step_q_gap": 0.08364247555233617,
"calib/step_q_w": 0.7199431818181818,
"calib/step_q_w_n": 176.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1757.0,
"completions/max_terminated_length": 1757.0,
"completions/mean_length": 449.4296875,
"completions/mean_terminated_length": 451.1921691894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.5438715815544128,
"kl": 1.3751029968261719,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0428,
"num_tokens": 16390922.0,
"reward": 0.32291221618652344,
"reward_std": 0.2514145076274872,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6859527230262756,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3557532727718353,
"step": 71
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7759275674306698,
"calib/avg_num_step_conf": 1.671875,
"calib/ece": 0.2571764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.45098039215686275,
"calib/gap": 0.14297581360010092,
"calib/mean_conf": 0.8178823529411765,
"calib/mu_c": 0.8773154362416106,
"calib/mu_w": 0.7343396226415096,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2453725490196078,
"calib/std_conf": 0.19438849791063734,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.835529411764706,
"calib/step_q_c_n": 255.0,
"calib/step_q_gap": 0.09801496089765405,
"calib/step_q_w": 0.7375144508670519,
"calib/step_q_w_n": 173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1742.0,
"completions/max_terminated_length": 1742.0,
"completions/mean_length": 419.8828125,
"completions/mean_terminated_length": 419.8828125,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.0768,
"grad_norm": 0.006729037035256624,
"kl": 0.05239105224609375,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0324,
"num_tokens": 16602820.0,
"reward": 0.37682515382766724,
"reward_std": 0.1965407133102417,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7313547134399414,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.2933293282985687,
"step": 72
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7297959183673469,
"calib/avg_num_step_conf": 1.61328125,
"calib/ece": 0.14845238095238103,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.47619047619047616,
"calib/gap": 0.1524311688311687,
"calib/mean_conf": 0.8398809523809524,
"calib/mu_c": 0.8864571428571428,
"calib/mu_w": 0.7340259740259741,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14694444444444454,
"calib/std_conf": 0.17984005712055565,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8746739130434783,
"calib/step_q_c_n": 276.0,
"calib/step_q_gap": 0.10839654078070471,
"calib/step_q_w": 0.7662773722627736,
"calib/step_q_w_n": 137.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2847.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 430.890625,
"completions/mean_terminated_length": 430.890625,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.006531127728521824,
"kl": 0.050022125244140625,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0332,
"num_tokens": 16820160.0,
"reward": 0.43554988503456116,
"reward_std": 0.2307089865207672,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7826601266860962,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.2443729043006897,
"step": 73
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6947563025210084,
"calib/avg_num_step_conf": 1.671875,
"calib/ece": 0.28581967213114756,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.47950819672131145,
"calib/gap": 0.15664806722689073,
"calib/mean_conf": 0.7800819672131148,
"calib/mu_c": 0.85648,
"calib/mu_w": 0.6998319327731093,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.2768032786885246,
"calib/std_conf": 0.24842288080530242,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.8335960591133006,
"calib/step_q_c_n": 203.0,
"calib/step_q_gap": 0.17937383689107844,
"calib/step_q_w": 0.6542222222222221,
"calib/step_q_w_n": 225.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2688.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 470.27734375,
"completions/mean_terminated_length": 473.9803161621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.006616615690290928,
"kl": 0.0474090576171875,
"learning_rate": 3.5e-06,
"loss": 0.0049,
"num_tokens": 17044479.0,
"reward": 0.3096698522567749,
"reward_std": 0.24875548481941223,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6624257564544678,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.3313673734664917,
"step": 74
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7571277289019225,
"calib/avg_num_step_conf": 1.49609375,
"calib/ece": 0.12769841269841273,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5992063492063492,
"calib/gap": 0.18189638318670576,
"calib/mean_conf": 0.8603174603174604,
"calib/mu_c": 0.9079569892473118,
"calib/mu_w": 0.7260606060606061,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.12496031746031748,
"calib/std_conf": 0.17724718865362435,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.9028679245283019,
"calib/step_q_c_n": 265.0,
"calib/step_q_gap": 0.13634250079948818,
"calib/step_q_w": 0.7665254237288137,
"calib/step_q_w_n": 118.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2414.0,
"completions/max_terminated_length": 2414.0,
"completions/mean_length": 379.8828125,
"completions/mean_terminated_length": 381.37255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.08,
"grad_norm": 0.006709123495966196,
"kl": 0.06363296508789062,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0133,
"num_tokens": 17246481.0,
"reward": 0.4585932791233063,
"reward_std": 0.15374954044818878,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.7987093925476074,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.21902284026145935,
"step": 75
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7128181336161188,
"calib/avg_num_step_conf": 1.6484375,
"calib/ece": 0.19640625000000012,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.53125,
"calib/gap": 0.13993107104984082,
"calib/mean_conf": 0.807578125,
"calib/mu_c": 0.8578658536585366,
"calib/mu_w": 0.7179347826086958,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18167968750000008,
"calib/std_conf": 0.2271485819270822,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8227916666666668,
"calib/step_q_c_n": 240.0,
"calib/step_q_gap": 0.14850595238095254,
"calib/step_q_w": 0.6742857142857143,
"calib/step_q_w_n": 182.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1130.0,
"completions/max_terminated_length": 1130.0,
"completions/mean_length": 403.10546875,
"completions/mean_terminated_length": 404.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.0064852419309318066,
"kl": 0.06375885009765625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0124,
"num_tokens": 17452732.0,
"reward": 0.38530316948890686,
"reward_std": 0.22022101283073425,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7547367215156555,
"rewards/format_reward_step": 1.0,
"rewards/step_l1_reward": -0.3122553825378418,
"step": 76
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6192792545902988,
"calib/avg_num_step_conf": 1.8515625,
"calib/ece": 0.2002371541501976,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4505928853754941,
"calib/gap": 0.09472663743491372,
"calib/mean_conf": 0.7932015810276679,
"calib/mu_c": 0.8265243902439026,
"calib/mu_w": 0.7317977528089888,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17260869565217385,
"calib/std_conf": 0.22309985285814607,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8138545454545455,
"calib/step_q_c_n": 275.0,
"calib/step_q_gap": 0.19908067610781177,
"calib/step_q_w": 0.6147738693467337,
"calib/step_q_w_n": 199.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2043.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 425.1484375,
"completions/mean_terminated_length": 426.8157043457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.006517153698951006,
"kl": 0.0614471435546875,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0051,
"num_tokens": 17666234.0,
"reward": 0.37873509526252747,
"reward_std": 0.2252240777015686,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7295206785202026,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.2962692677974701,
"step": 77
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6582236842105265,
"calib/avg_num_step_conf": 1.60546875,
"calib/ece": 0.19643137254901955,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5450980392156862,
"calib/gap": 0.16037828947368415,
"calib/mean_conf": 0.8103137254901961,
"calib/mu_c": 0.8700624999999998,
"calib/mu_w": 0.7096842105263157,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18964705882352934,
"calib/std_conf": 0.23157918987715614,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8385714285714285,
"calib/step_q_c_n": 238.0,
"calib/step_q_gap": 0.17065235342691976,
"calib/step_q_w": 0.6679190751445088,
"calib/step_q_w_n": 173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2206.0,
"completions/max_terminated_length": 2206.0,
"completions/mean_length": 451.640625,
"completions/mean_terminated_length": 451.640625,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0832,
"grad_norm": 0.0056913201697170734,
"kl": 0.05938720703125,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0458,
"num_tokens": 17889878.0,
"reward": 0.3906322121620178,
"reward_std": 0.22954101860523224,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7474750280380249,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.28964805603027344,
"step": 78
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7005917159763314,
"calib/avg_num_step_conf": 1.52734375,
"calib/ece": 0.16334645669291337,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.515748031496063,
"calib/gap": 0.15910824921684652,
"calib/mean_conf": 0.8051574803149606,
"calib/mu_c": 0.8584023668639053,
"calib/mu_w": 0.6992941176470587,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1515748031496063,
"calib/std_conf": 0.2309117814025555,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8237051792828686,
"calib/step_q_c_n": 251.0,
"calib/step_q_gap": 0.13899089356858274,
"calib/step_q_w": 0.6847142857142858,
"calib/step_q_w_n": 140.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1767.0,
"completions/max_terminated_length": 1767.0,
"completions/mean_length": 427.8359375,
"completions/mean_terminated_length": 429.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.005721926223486662,
"kl": 0.05712890625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0182,
"num_tokens": 18105780.0,
"reward": 0.4099389910697937,
"reward_std": 0.20847250521183014,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.769273042678833,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.2798638343811035,
"step": 79
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7394636015325671,
"calib/avg_num_step_conf": 1.46484375,
"calib/ece": 0.1962352941176469,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6235294117647059,
"calib/gap": 0.15399106002554308,
"calib/mean_conf": 0.8647058823529412,
"calib/mu_c": 0.9136206896551726,
"calib/mu_w": 0.7596296296296295,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18929411764705867,
"calib/std_conf": 0.19809636139205197,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8561632653061224,
"calib/step_q_c_n": 245.0,
"calib/step_q_gap": 0.1252401883830454,
"calib/step_q_w": 0.730923076923077,
"calib/step_q_w_n": 130.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1303.0,
"completions/max_terminated_length": 1303.0,
"completions/mean_length": 368.0234375,
"completions/mean_terminated_length": 369.4666748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.007674939930438995,
"kl": 0.07657623291015625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0131,
"num_tokens": 18302154.0,
"reward": 0.4051653742790222,
"reward_std": 0.23084446787834167,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7694988250732422,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.29276180267333984,
"step": 80
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7535815673070575,
"calib/avg_num_step_conf": 1.4453125,
"calib/ece": 0.198531746031746,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5476190476190477,
"calib/gap": 0.23549019607843125,
"calib/mean_conf": 0.7851984126984127,
"calib/mu_c": 0.8777124183006535,
"calib/mu_w": 0.6422222222222222,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1882936507936508,
"calib/std_conf": 0.27287337955051927,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8794565217391304,
"calib/step_q_c_n": 184.0,
"calib/step_q_gap": 0.24569308087891528,
"calib/step_q_w": 0.6337634408602151,
"calib/step_q_w_n": 186.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3034.0,
"completions/max_terminated_length": 3034.0,
"completions/mean_length": 430.89453125,
"completions/mean_terminated_length": 432.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.0864,
"grad_norm": 0.00645162258297205,
"kl": 0.06568145751953125,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0158,
"num_tokens": 18518711.0,
"reward": 0.39590904116630554,
"reward_std": 0.24231690168380737,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7513003349304199,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.2743260860443115,
"step": 81
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7049266247379455,
"calib/avg_num_step_conf": 1.7734375,
"calib/ece": 0.20907630522088347,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6224899598393574,
"calib/gap": 0.18125157232704392,
"calib/mean_conf": 0.8314056224899599,
"calib/mu_c": 0.8969182389937106,
"calib/mu_w": 0.7156666666666667,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20096385542168668,
"calib/std_conf": 0.2432742517711509,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8766331658291456,
"calib/step_q_c_n": 199.0,
"calib/step_q_gap": 0.39949591092718484,
"calib/step_q_w": 0.4771372549019608,
"calib/step_q_w_n": 255.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 393.23046875,
"completions/mean_terminated_length": 397.893310546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.006193524692207575,
"kl": 0.066192626953125,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0215,
"num_tokens": 18724930.0,
"reward": 0.38317593932151794,
"reward_std": 0.25840771198272705,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7358046770095825,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.28820282220840454,
"step": 82
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7366818873668187,
"calib/avg_num_step_conf": 1.34375,
"calib/ece": 0.27661417322834636,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6496062992125984,
"calib/gap": 0.19511288685946238,
"calib/mean_conf": 0.817244094488189,
"calib/mu_c": 0.9002054794520549,
"calib/mu_w": 0.7050925925925925,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25952755905511804,
"calib/std_conf": 0.27007341424008796,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8611235955056181,
"calib/step_q_c_n": 178.0,
"calib/step_q_gap": 0.16425612562610004,
"calib/step_q_w": 0.6968674698795181,
"calib/step_q_w_n": 166.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2342.0,
"completions/max_terminated_length": 2342.0,
"completions/mean_length": 458.265625,
"completions/mean_terminated_length": 458.265625,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.005535097327083349,
"kl": 0.06232452392578125,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0012,
"num_tokens": 18949510.0,
"reward": 0.3472113311290741,
"reward_std": 0.2140897512435913,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.704337477684021,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3208523690700531,
"step": 83
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7351,
"calib/avg_num_step_conf": 1.625,
"calib/ece": 0.22379999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.604,
"calib/gap": 0.2233999999999995,
"calib/mean_conf": 0.8156400000000001,
"calib/mu_c": 0.9049999999999996,
"calib/mu_w": 0.6816000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.21971999999999994,
"calib/std_conf": 0.24825750824496726,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8542105263157894,
"calib/step_q_c_n": 247.0,
"calib/step_q_gap": 0.1745655559015883,
"calib/step_q_w": 0.6796449704142011,
"calib/step_q_w_n": 169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 384.80859375,
"completions/mean_terminated_length": 387.8385925292969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.0896,
"grad_norm": 0.006911741103976965,
"kl": 0.0834808349609375,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0439,
"num_tokens": 19153941.0,
"reward": 0.37587296962738037,
"reward_std": 0.2167174518108368,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7413082122802734,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3020622432231903,
"step": 84
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7137548850022423,
"calib/avg_num_step_conf": 1.6796875,
"calib/ece": 0.28148000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.548,
"calib/gap": 0.20196489204945867,
"calib/mean_conf": 0.79628,
"calib/mu_c": 0.8940310077519381,
"calib/mu_w": 0.6920661157024794,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.28088,
"calib/std_conf": 0.2644060543936163,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8198295454545456,
"calib/step_q_c_n": 176.0,
"calib/step_q_gap": 0.18455395490336446,
"calib/step_q_w": 0.6352755905511811,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2637.0,
"completions/max_terminated_length": 2637.0,
"completions/mean_length": 428.75390625,
"completions/mean_terminated_length": 433.83795166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.006645851768553257,
"kl": 0.06461334228515625,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0305,
"num_tokens": 19371526.0,
"reward": 0.3220665454864502,
"reward_std": 0.23552703857421875,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6779835820198059,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3268192708492279,
"step": 85
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.734944,
"calib/avg_num_step_conf": 1.80859375,
"calib/ece": 0.23211999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.464,
"calib/gap": 0.2619999999999999,
"calib/mean_conf": 0.7321200000000001,
"calib/mu_c": 0.8631199999999999,
"calib/mu_w": 0.60112,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.23211999999999997,
"calib/std_conf": 0.29388825359309617,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8383815028901735,
"calib/step_q_c_n": 173.0,
"calib/step_q_gap": 0.17045046840741485,
"calib/step_q_w": 0.6679310344827587,
"calib/step_q_w_n": 290.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 433.67578125,
"completions/mean_terminated_length": 438.8182067871094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.006340604741126299,
"kl": 0.07331085205078125,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0278,
"num_tokens": 19588059.0,
"reward": 0.3461582064628601,
"reward_std": 0.222943514585495,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7213422060012817,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3212132453918457,
"step": 86
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6578105493133583,
"calib/avg_num_step_conf": 1.4296875,
"calib/ece": 0.15968,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.608,
"calib/gap": 0.1909269662921348,
"calib/mean_conf": 0.81344,
"calib/mu_c": 0.8684269662921348,
"calib/mu_w": 0.6775,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13055999999999998,
"calib/std_conf": 0.257904180656305,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8738202247191011,
"calib/step_q_c_n": 267.0,
"calib/step_q_gap": 0.15685052774940433,
"calib/step_q_w": 0.7169696969696968,
"calib/step_q_w_n": 99.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2918.0,
"completions/max_terminated_length": 2918.0,
"completions/mean_length": 390.3046875,
"completions/mean_terminated_length": 394.9328308105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.0928,
"grad_norm": 0.0069213323295116425,
"kl": 0.074493408203125,
"learning_rate": 3.138888888888889e-06,
"loss": 0.0473,
"num_tokens": 19793473.0,
"reward": 0.44103753566741943,
"reward_std": 0.209333136677742,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7777742147445679,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.23007416725158691,
"step": 87
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7320478723404256,
"calib/avg_num_step_conf": 1.54296875,
"calib/ece": 0.20110236220472455,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5866141732283464,
"calib/gap": 0.23726728723404245,
"calib/mean_conf": 0.8006299212598426,
"calib/mu_c": 0.8884375,
"calib/mu_w": 0.6511702127659575,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18590551181102377,
"calib/std_conf": 0.26281261259867017,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8605769230769231,
"calib/step_q_c_n": 208.0,
"calib/step_q_gap": 0.269721308103661,
"calib/step_q_w": 0.5908556149732621,
"calib/step_q_w_n": 187.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1747.0,
"completions/max_terminated_length": 1747.0,
"completions/mean_length": 425.328125,
"completions/mean_terminated_length": 426.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.010172465816140175,
"kl": 0.13602447509765625,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0003,
"num_tokens": 20012205.0,
"reward": 0.4115146994590759,
"reward_std": 0.22872042655944824,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7693734169006348,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.2690003216266632,
"step": 88
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7624516129032258,
"calib/avg_num_step_conf": 1.4375,
"calib/ece": 0.27738955823293177,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4979919678714859,
"calib/gap": 0.21976322580645136,
"calib/mean_conf": 0.7500803212851407,
"calib/mu_c": 0.8604032258064515,
"calib/mu_w": 0.6406400000000001,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.2647389558232932,
"calib/std_conf": 0.28767869176056377,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.766627906976744,
"calib/step_q_c_n": 172.0,
"calib/step_q_gap": 0.07586260085429508,
"calib/step_q_w": 0.6907653061224489,
"calib/step_q_w_n": 196.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2629.0,
"completions/max_terminated_length": 2629.0,
"completions/mean_length": 445.9765625,
"completions/mean_terminated_length": 449.4881896972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.006914828438311815,
"kl": 0.09096908569335938,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0262,
"num_tokens": 20235263.0,
"reward": 0.317240834236145,
"reward_std": 0.21951505541801453,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.683097243309021,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.33767807483673096,
"step": 89
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6417276210379659,
"calib/avg_num_step_conf": 1.35546875,
"calib/ece": 0.19519841269841268,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5833333333333334,
"calib/gap": 0.17187669801462913,
"calib/mean_conf": 0.7947222222222222,
"calib/mu_c": 0.8540606060606062,
"calib/mu_w": 0.6821839080459771,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16757936507936505,
"calib/std_conf": 0.2754367423383859,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8222072072072072,
"calib/step_q_c_n": 222.0,
"calib/step_q_gap": 0.15652720720720714,
"calib/step_q_w": 0.66568,
"calib/step_q_w_n": 125.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1354.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 394.7734375,
"completions/mean_terminated_length": 401.0397033691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.096,
"grad_norm": 0.005598283838480711,
"kl": 0.07326507568359375,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0349,
"num_tokens": 20439645.0,
"reward": 0.3961567282676697,
"reward_std": 0.22430838644504547,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7443863153457642,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.27785414457321167,
"step": 90
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6981297282151863,
"calib/avg_num_step_conf": 1.453125,
"calib/ece": 0.2087698412698412,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5873015873015873,
"calib/gap": 0.16699215466517248,
"calib/mean_conf": 0.8001190476190476,
"calib/mu_c": 0.85710843373494,
"calib/mu_w": 0.6901162790697675,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.175079365079365,
"calib/std_conf": 0.26773370171499444,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8462385321100918,
"calib/step_q_c_n": 218.0,
"calib/step_q_gap": 0.20649827236983198,
"calib/step_q_w": 0.6397402597402598,
"calib/step_q_w_n": 154.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1892.0,
"completions/max_terminated_length": 1892.0,
"completions/mean_length": 417.90234375,
"completions/mean_terminated_length": 421.1929016113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.005817817524075508,
"kl": 0.077728271484375,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.014,
"num_tokens": 20654340.0,
"reward": 0.39327478408813477,
"reward_std": 0.22805273532867432,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.739641010761261,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.27731022238731384,
"step": 91
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6651152043308907,
"calib/avg_num_step_conf": 1.33984375,
"calib/ece": 0.2482936507936508,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.626984126984127,
"calib/gap": 0.14431966726084378,
"calib/mean_conf": 0.8218650793650795,
"calib/mu_c": 0.878562091503268,
"calib/mu_w": 0.7342424242424243,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2315079365079365,
"calib/std_conf": 0.2564255470141414,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8427638190954773,
"calib/step_q_c_n": 199.0,
"calib/step_q_gap": 0.1665832635399217,
"calib/step_q_w": 0.6761805555555556,
"calib/step_q_w_n": 144.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1250.0,
"completions/max_terminated_length": 1250.0,
"completions/mean_length": 373.69921875,
"completions/mean_terminated_length": 378.1304626464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.0057909623719751835,
"kl": 0.08457183837890625,
"learning_rate": 3e-06,
"loss": -0.0316,
"num_tokens": 20856727.0,
"reward": 0.36944615840911865,
"reward_std": 0.26509976387023926,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6994296908378601,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.2753811180591583,
"step": 92
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7377803335250144,
"calib/avg_num_step_conf": 1.4765625,
"calib/ece": 0.20912698412698408,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.49603174603174605,
"calib/gap": 0.2634962622196666,
"calib/mean_conf": 0.7483333333333334,
"calib/mu_c": 0.8643971631205674,
"calib/mu_w": 0.6009009009009008,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1989682539682539,
"calib/std_conf": 0.301340260139518,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8316836734693878,
"calib/step_q_c_n": 196.0,
"calib/step_q_gap": 0.1358045525902668,
"calib/step_q_w": 0.695879120879121,
"calib/step_q_w_n": 182.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2485.0,
"completions/max_terminated_length": 2485.0,
"completions/mean_length": 417.0625,
"completions/mean_terminated_length": 418.69805908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.0992,
"grad_norm": 0.005857251584529877,
"kl": 0.077850341796875,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0395,
"num_tokens": 21069271.0,
"reward": 0.368843674659729,
"reward_std": 0.2849258780479431,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7412355542182922,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3097981810569763,
"step": 93
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7032701111837802,
"calib/avg_num_step_conf": 1.5234375,
"calib/ece": 0.22566265060240964,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4979919678714859,
"calib/gap": 0.21499542184434284,
"calib/mean_conf": 0.7471084337349396,
"calib/mu_c": 0.8420863309352519,
"calib/mu_w": 0.627090909090909,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2072690763052209,
"calib/std_conf": 0.28842821573431987,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8143243243243242,
"calib/step_q_c_n": 185.0,
"calib/step_q_gap": 0.22725115359261694,
"calib/step_q_w": 0.5870731707317073,
"calib/step_q_w_n": 205.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 433.34765625,
"completions/mean_terminated_length": 433.34765625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.005978456698358059,
"kl": 0.07855224609375,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0582,
"num_tokens": 21288888.0,
"reward": 0.35172322392463684,
"reward_std": 0.24280011653900146,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7168351411819458,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.315732479095459,
"step": 94
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7224184782608696,
"calib/avg_num_step_conf": 1.2734375,
"calib/ece": 0.1770634920634921,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5912698412698413,
"calib/gap": 0.23824999999999996,
"calib/mean_conf": 0.7662698412698412,
"calib/mu_c": 0.8532500000000001,
"calib/mu_w": 0.6150000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15420634920634924,
"calib/std_conf": 0.3048164530382701,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8723756906077347,
"calib/step_q_c_n": 181.0,
"calib/step_q_gap": 0.2783756906077347,
"calib/step_q_w": 0.594,
"calib/step_q_w_n": 145.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 388.80859375,
"completions/mean_terminated_length": 393.4189758300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.005680699832737446,
"kl": 0.0786590576171875,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0023,
"num_tokens": 21494551.0,
"reward": 0.4002438187599182,
"reward_std": 0.2253013551235199,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.749567985534668,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.2686115801334381,
"step": 95
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8099490795142968,
"calib/avg_num_step_conf": 1.375,
"calib/ece": 0.12677165354330705,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6417322834645669,
"calib/gap": 0.26587230708969833,
"calib/mean_conf": 0.8439370078740158,
"calib/mu_c": 0.9161621621621622,
"calib/mu_w": 0.6502898550724638,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1211811023622047,
"calib/std_conf": 0.23293964437543868,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8667782426778241,
"calib/step_q_c_n": 239.0,
"calib/step_q_gap": 0.24863664975747013,
"calib/step_q_w": 0.618141592920354,
"calib/step_q_w_n": 113.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 362.4140625,
"completions/mean_terminated_length": 363.8353271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.1024,
"grad_norm": 0.006683533079922199,
"kl": 0.0844573974609375,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0758,
"num_tokens": 21693145.0,
"reward": 0.48853859305381775,
"reward_std": 0.20125803351402283,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.8290726542472839,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.1934017539024353,
"step": 96
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6631067961165049,
"calib/avg_num_step_conf": 1.5859375,
"calib/ece": 0.2184584980237153,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5177865612648221,
"calib/gap": 0.17730355987055002,
"calib/mean_conf": 0.7696837944664032,
"calib/mu_c": 0.8418666666666665,
"calib/mu_w": 0.6645631067961165,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.19762845849802363,
"calib/std_conf": 0.28143319473200823,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.816930693069307,
"calib/step_q_c_n": 202.0,
"calib/step_q_gap": 0.19962677150067953,
"calib/step_q_w": 0.6173039215686275,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1005.0,
"completions/max_terminated_length": 1005.0,
"completions/mean_length": 386.8203125,
"completions/mean_terminated_length": 389.86614990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.10346666666666667,
"grad_norm": 1.6125926971435547,
"kl": 17.58576202392578,
"learning_rate": 2.861111111111111e-06,
"loss": 0.1785,
"num_tokens": 21897243.0,
"reward": 0.35906365513801575,
"reward_std": 0.26138195395469666,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.717347264289856,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3125011920928955,
"step": 97
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6870081365879686,
"calib/avg_num_step_conf": 1.421875,
"calib/ece": 0.2273092369477912,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5502008032128514,
"calib/gap": 0.20064625850340145,
"calib/mean_conf": 0.7567871485943775,
"calib/mu_c": 0.8389795918367348,
"calib/mu_w": 0.6383333333333333,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1968674698795181,
"calib/std_conf": 0.3095405957174121,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8312499999999999,
"calib/step_q_c_n": 192.0,
"calib/step_q_gap": 0.20409883720930222,
"calib/step_q_w": 0.6271511627906977,
"calib/step_q_w_n": 172.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2810.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 436.0,
"completions/mean_terminated_length": 439.4330749511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.005526782479137182,
"kl": 0.09055328369140625,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0457,
"num_tokens": 22115043.0,
"reward": 0.36131489276885986,
"reward_std": 0.2622639238834381,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7069636583328247,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.29214632511138916,
"step": 98
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6965757317419722,
"calib/avg_num_step_conf": 1.69140625,
"calib/ece": 0.27004081632653065,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.34285714285714286,
"calib/gap": 0.214514776925263,
"calib/mean_conf": 0.6037551020408163,
"calib/mu_c": 0.737717391304348,
"calib/mu_w": 0.523202614379085,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.24914285714285717,
"calib/std_conf": 0.34016529779051236,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6817730496453901,
"calib/step_q_c_n": 141.0,
"calib/step_q_gap": 0.15404907704265047,
"calib/step_q_w": 0.5277239726027396,
"calib/step_q_w_n": 292.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2496.0,
"completions/max_terminated_length": 2496.0,
"completions/mean_length": 465.58203125,
"completions/mean_terminated_length": 476.7560119628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.1056,
"grad_norm": 0.005705671850591898,
"kl": 0.11505126953125,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0724,
"num_tokens": 22340032.0,
"reward": 0.2724548578262329,
"reward_std": 0.2788415849208832,
"rewards/accuracy_reward_step": 0.359375,
"rewards/final_brier_reward_step": 0.6610808372497559,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.37788987159729004,
"step": 99
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7803045090767129,
"calib/avg_num_step_conf": 1.73046875,
"calib/ece": 0.14891999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.408,
"calib/gap": 0.31491769145682863,
"calib/mean_conf": 0.6753199999999999,
"calib/mu_c": 0.8126241134751773,
"calib/mu_w": 0.49770642201834864,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.13011999999999996,
"calib/std_conf": 0.32655182988309833,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7673272727272726,
"calib/step_q_c_n": 220.0,
"calib/step_q_gap": 0.28342592743579276,
"calib/step_q_w": 0.4839013452914799,
"calib/step_q_w_n": 223.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2049.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 462.1953125,
"completions/mean_terminated_length": 464.00787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.005212652962654829,
"kl": 0.13134765625,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0628,
"num_tokens": 22565762.0,
"reward": 0.3926239013671875,
"reward_std": 0.23804575204849243,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7540972828865051,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.27041196823120117,
"step": 100
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6799299960375115,
"calib/avg_num_step_conf": 1.7890625,
"calib/ece": 0.25408906882591087,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.3562753036437247,
"calib/gap": 0.1919475630696077,
"calib/mean_conf": 0.654008097165992,
"calib/mu_c": 0.758141592920354,
"calib/mu_w": 0.5661940298507463,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.22530364372469633,
"calib/std_conf": 0.33032514949346076,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6903804347826087,
"calib/step_q_c_n": 184.0,
"calib/step_q_gap": 0.1653676610599809,
"calib/step_q_w": 0.5250127737226278,
"calib/step_q_w_n": 274.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 464.21484375,
"completions/mean_terminated_length": 469.7193908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.005432981997728348,
"kl": 0.164337158203125,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0066,
"num_tokens": 22791593.0,
"reward": 0.29547733068466187,
"reward_std": 0.2509266138076782,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.6622054576873779,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.3485945761203766,
"step": 101
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.760089098532495,
"calib/avg_num_step_conf": 1.609375,
"calib/ece": 0.16680000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.424,
"calib/gap": 0.30636399371069184,
"calib/mean_conf": 0.6665599999999999,
"calib/mu_c": 0.7964583333333334,
"calib/mu_w": 0.49009433962264154,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.12868000000000007,
"calib/std_conf": 0.3397554508760676,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7313106796116505,
"calib/step_q_c_n": 206.0,
"calib/step_q_gap": 0.21928300970873793,
"calib/step_q_w": 0.5120276699029126,
"calib/step_q_w_n": 206.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2606.0,
"completions/max_terminated_length": 2606.0,
"completions/mean_length": 379.53515625,
"completions/mean_terminated_length": 382.52362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.1088,
"grad_norm": 0.0060148392803967,
"kl": 0.206390380859375,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0235,
"num_tokens": 22995450.0,
"reward": 0.3910207748413086,
"reward_std": 0.21820616722106934,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7527488470077515,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.2753947973251343,
"step": 102
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7216002747252748,
"calib/avg_num_step_conf": 1.58984375,
"calib/ece": 0.15918032786885244,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.39344262295081966,
"calib/gap": 0.27140934065934064,
"calib/mean_conf": 0.6452459016393443,
"calib/mu_c": 0.7609285714285714,
"calib/mu_w": 0.48951923076923076,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.115327868852459,
"calib/std_conf": 0.349504832666122,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.7487878787878788,
"calib/step_q_c_n": 198.0,
"calib/step_q_gap": 0.2991505582137161,
"calib/step_q_w": 0.44963732057416267,
"calib/step_q_w_n": 209.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3008.0,
"completions/max_terminated_length": 3008.0,
"completions/mean_length": 496.28125,
"completions/mean_terminated_length": 504.15875244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.004697397816926241,
"kl": 0.180694580078125,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0238,
"num_tokens": 23227050.0,
"reward": 0.3566873073577881,
"reward_std": 0.24476046860218048,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6928308606147766,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": -0.2700812518596649,
"step": 103
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6650138696255201,
"calib/avg_num_step_conf": 1.84375,
"calib/ece": 0.25530864197530867,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.3168724279835391,
"calib/gap": 0.19997642163661578,
"calib/mean_conf": 0.5960493827160492,
"calib/mu_c": 0.71126213592233,
"calib/mu_w": 0.5112857142857142,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.2137448559670782,
"calib/std_conf": 0.3493545336101135,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.7314465753424658,
"calib/step_q_c_n": 146.0,
"calib/step_q_gap": 0.2991766366921591,
"calib/step_q_w": 0.4322699386503067,
"calib/step_q_w_n": 326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2888.0,
"completions/max_terminated_length": 2888.0,
"completions/mean_length": 444.640625,
"completions/mean_terminated_length": 451.69842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.005236889701336622,
"kl": 0.1962127685546875,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0415,
"num_tokens": 23447558.0,
"reward": 0.2982354760169983,
"reward_std": 0.2744593024253845,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.6419824361801147,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": -0.30957403779029846,
"step": 104
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6850935828877006,
"calib/avg_num_step_conf": 1.6875,
"calib/ece": 0.23959349593495938,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.34552845528455284,
"calib/gap": 0.23148128342245977,
"calib/mean_conf": 0.604390243902439,
"calib/mu_c": 0.7323636363636362,
"calib/mu_w": 0.5008823529411764,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.19841463414634147,
"calib/std_conf": 0.3640862185512724,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.726094674556213,
"calib/step_q_c_n": 169.0,
"calib/step_q_gap": 0.19453573919499623,
"calib/step_q_w": 0.5315589353612168,
"calib/step_q_w_n": 263.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2267.0,
"completions/max_terminated_length": 2267.0,
"completions/mean_length": 430.09375,
"completions/mean_terminated_length": 436.920654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.112,
"grad_norm": 0.0060654194094240665,
"kl": 0.250762939453125,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0077,
"num_tokens": 23663422.0,
"reward": 0.2967536449432373,
"reward_std": 0.26435887813568115,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6521124839782715,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l1_reward": -0.32813647389411926,
"step": 105
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6703801652892561,
"calib/avg_num_step_conf": 1.375,
"calib/ece": 0.22585772357723571,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.34146341463414637,
"calib/gap": 0.20697381818181804,
"calib/mean_conf": 0.626987804878049,
"calib/mu_c": 0.7287919999999999,
"calib/mu_w": 0.5218181818181818,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.17235772357723572,
"calib/std_conf": 0.3483230964579807,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.7886063063063062,
"calib/step_q_c_n": 148.0,
"calib/step_q_gap": 0.19341954160042385,
"calib/step_q_w": 0.5951867647058824,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1294.0,
"completions/max_terminated_length": 1294.0,
"completions/mean_length": 405.99609375,
"completions/mean_terminated_length": 412.44049072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.005529459100216627,
"kl": 0.234954833984375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0274,
"num_tokens": 23871941.0,
"reward": 0.30446913838386536,
"reward_std": 0.2846444845199585,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6475687623023987,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": -0.3183179795742035,
"step": 106
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5851772860570515,
"calib/avg_num_step_conf": 1.5625,
"calib/ece": 0.293224081632653,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.39183673469387753,
"calib/gap": 0.11109621434284223,
"calib/mean_conf": 0.6775514285714285,
"calib/mu_c": 0.7324193548387098,
"calib/mu_w": 0.6213231404958676,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.23232653061224487,
"calib/std_conf": 0.3371138069324509,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.7262352941176471,
"calib/step_q_c_n": 170.0,
"calib/step_q_gap": 0.17114746803069047,
"calib/step_q_w": 0.5550878260869566,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2541.0,
"completions/max_terminated_length": 2541.0,
"completions/mean_length": 419.98828125,
"completions/mean_terminated_length": 421.63531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.005133834667503834,
"kl": 0.25164794921875,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0226,
"num_tokens": 24084074.0,
"reward": 0.2446097731590271,
"reward_std": 0.29522377252578735,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5793129205703735,
"rewards/format_reward_step": 0.890625,
"rewards/step_l1_reward": -0.3650933504104614,
"step": 107
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6171189692982456,
"calib/avg_num_step_conf": 1.140625,
"calib/ece": 0.2592633064516129,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.4435483870967742,
"calib/gap": 0.13494758771929827,
"calib/mean_conf": 0.6685431451612904,
"calib/mu_c": 0.7207809210526316,
"calib/mu_w": 0.5858333333333333,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.15745161290322582,
"calib/std_conf": 0.35744971521127844,
"calib/step_conf_rate": 0.90625,
"calib/step_q_c": 0.768778488372093,
"calib/step_q_c_n": 172.0,
"calib/step_q_gap": 0.12694515503875958,
"calib/step_q_w": 0.6418333333333334,
"calib/step_q_w_n": 120.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1439.0,
"completions/max_terminated_length": 1439.0,
"completions/mean_length": 434.44921875,
"completions/mean_terminated_length": 436.1529541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.1152,
"grad_norm": 0.004808885511010885,
"kl": 0.228851318359375,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0153,
"num_tokens": 24298525.0,
"reward": 0.3085951805114746,
"reward_std": 0.3162803053855896,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6369038224220276,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": -0.3197134733200073,
"step": 108
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7463654891304348,
"calib/avg_num_step_conf": 1.56640625,
"calib/ece": 0.1926337448559671,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.35390946502057613,
"calib/gap": 0.2971317934782609,
"calib/mean_conf": 0.6113991769547326,
"calib/mu_c": 0.7679130434782608,
"calib/mu_w": 0.47078124999999993,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.16539094650205763,
"calib/std_conf": 0.3559796329763443,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.717907100591716,
"calib/step_q_c_n": 169.0,
"calib/step_q_gap": 0.19427347990206079,
"calib/step_q_w": 0.5236336206896552,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1530.0,
"completions/max_terminated_length": 1530.0,
"completions/mean_length": 419.140625,
"completions/mean_terminated_length": 427.49005126953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.005312109831720591,
"kl": 0.236328125,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0321,
"num_tokens": 24510425.0,
"reward": 0.3231509327888489,
"reward_std": 0.2573995888233185,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6752394437789917,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": -0.30003127455711365,
"step": 109
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.653310684785505,
"calib/avg_num_step_conf": 1.625,
"calib/ece": 0.2919429149797571,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.3724696356275304,
"calib/gap": 0.17001091793232093,
"calib/mean_conf": 0.6540489878542509,
"calib/mu_c": 0.7497231481481482,
"calib/mu_w": 0.5797122302158273,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.25437246963562754,
"calib/std_conf": 0.34342655702467995,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.7162337662337663,
"calib/step_q_c_n": 154.0,
"calib/step_q_gap": 0.1783955982948351,
"calib/step_q_w": 0.5378381679389312,
"calib/step_q_w_n": 262.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2917.0,
"completions/max_terminated_length": 2917.0,
"completions/mean_length": 429.60546875,
"completions/mean_terminated_length": 436.42462158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.004588097333908081,
"kl": 0.236175537109375,
"learning_rate": 2.5e-06,
"loss": -0.0414,
"num_tokens": 24725324.0,
"reward": 0.2684740722179413,
"reward_std": 0.2740425765514374,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.6149437427520752,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": -0.3444017767906189,
"step": 110
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.6699943757030371,
"calib/avg_num_step_conf": 1.4921875,
"calib/ece": 0.26048814504881457,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.4560669456066946,
"calib/gap": 0.1721283277090362,
"calib/mean_conf": 0.6770013947001395,
"calib/mu_c": 0.7576640419947505,
"calib/mu_w": 0.5855357142857143,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.20305439330543942,
"calib/std_conf": 0.34518280147230895,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.7737504873294346,
"calib/step_q_c_n": 171.0,
"calib/step_q_gap": 0.23127465794554825,
"calib/step_q_w": 0.5424758293838864,
"calib/step_q_w_n": 211.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2612.0,
"completions/max_terminated_length": 2612.0,
"completions/mean_length": 450.96484375,
"completions/mean_terminated_length": 454.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.1184,
"grad_norm": 0.005634634755551815,
"kl": 0.24365234375,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0245,
"num_tokens": 24948179.0,
"reward": 0.3159874975681305,
"reward_std": 0.2590514123439789,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6415572762489319,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": -0.292394757270813,
"step": 111
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.7191492450638793,
"calib/avg_num_step_conf": 1.4609375,
"calib/ece": 0.18625531914893617,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.87109375,
"calib/frac_conf_gt_0.9": 0.3404255319148936,
"calib/gap": 0.29017857142857145,
"calib/mean_conf": 0.5682978723404256,
"calib/mu_c": 0.7201785714285714,
"calib/mu_w": 0.43,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.13897872340425535,
"calib/std_conf": 0.3726427708628253,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.686558024691358,
"calib/step_q_c_n": 162.0,
"calib/step_q_gap": 0.23490708129513155,
"calib/step_q_w": 0.4516509433962264,
"calib/step_q_w_n": 212.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 464.984375,
"completions/mean_terminated_length": 474.2470397949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.005269154440611601,
"kl": 0.239166259765625,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0021,
"num_tokens": 25175135.0,
"reward": 0.31370168924331665,
"reward_std": 0.27312415838241577,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6559332013130188,
"rewards/format_reward_step": 0.87109375,
"rewards/step_l1_reward": -0.2902485728263855,
"step": 112
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7051602564102564,
"calib/avg_num_step_conf": 1.6328125,
"calib/ece": 0.24623999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.376,
"calib/gap": 0.23257692307692324,
"calib/mean_conf": 0.64656,
"calib/mu_c": 0.7675000000000001,
"calib/mu_w": 0.5349230769230768,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.20639999999999997,
"calib/std_conf": 0.3435057006804982,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.766530612244898,
"calib/step_q_c_n": 147.0,
"calib/step_q_gap": 0.2937234781735574,
"calib/step_q_w": 0.4728071340713407,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 386.9375,
"completions/mean_terminated_length": 389.9842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.006095048971474171,
"kl": 0.2994384765625,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0163,
"num_tokens": 25379391.0,
"reward": 0.30809134244918823,
"reward_std": 0.28997525572776794,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6653277277946472,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": -0.32805129885673523,
"step": 113
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6874215590573142,
"calib/avg_num_step_conf": 1.52734375,
"calib/ece": 0.1843621399176955,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.3292181069958848,
"calib/gap": 0.23749686236229256,
"calib/mean_conf": 0.6245267489711934,
"calib/mu_c": 0.7232394366197183,
"calib/mu_w": 0.4857425742574258,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.11226337448559671,
"calib/std_conf": 0.3492250125159953,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.7372666666666667,
"calib/step_q_c_n": 200.0,
"calib/step_q_gap": 0.26213577661431064,
"calib/step_q_w": 0.4751308900523561,
"calib/step_q_w_n": 191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2904.0,
"completions/max_terminated_length": 2904.0,
"completions/mean_length": 397.74609375,
"completions/mean_terminated_length": 400.8779602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.1216,
"grad_norm": 0.005061530973762274,
"kl": 0.26641845703125,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0279,
"num_tokens": 25586238.0,
"reward": 0.3437195420265198,
"reward_std": 0.26500290632247925,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.678789496421814,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": -0.28431904315948486,
"step": 114
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6103429010405755,
"calib/avg_num_step_conf": 1.53515625,
"calib/ece": 0.29062499999999997,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.2875,
"calib/gap": 0.1263461135554158,
"calib/mean_conf": 0.5681250000000001,
"calib/mu_c": 0.636036036036036,
"calib/mu_w": 0.5096899224806202,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.198125,
"calib/std_conf": 0.3588108615621885,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.6452280701754386,
"calib/step_q_c_n": 171.0,
"calib/step_q_gap": 0.09575014224751077,
"calib/step_q_w": 0.5494779279279278,
"calib/step_q_w_n": 222.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3008.0,
"completions/max_terminated_length": 3008.0,
"completions/mean_length": 402.4609375,
"completions/mean_terminated_length": 405.6299133300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.00514686293900013,
"kl": 0.2657623291015625,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0078,
"num_tokens": 25794532.0,
"reward": 0.2529529333114624,
"reward_std": 0.2864396572113037,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.5985375046730042,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l1_reward": -0.3598191440105438,
"step": 115
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6419,
"calib/avg_num_step_conf": 1.75,
"calib/ece": 0.2500408163265306,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3183673469387755,
"calib/gap": 0.17293999999999998,
"calib/mean_conf": 0.587265306122449,
"calib/mu_c": 0.6755,
"calib/mu_w": 0.50256,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.1737551020408163,
"calib/std_conf": 0.3643437923176632,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.6730946502057613,
"calib/step_q_c_n": 162.0,
"calib/step_q_gap": 0.26321745722330525,
"calib/step_q_w": 0.40987719298245606,
"calib/step_q_w_n": 285.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1813.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 438.4296875,
"completions/mean_terminated_length": 440.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.005122186616063118,
"kl": 0.2557373046875,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0139,
"num_tokens": 26011290.0,
"reward": 0.27786949276924133,
"reward_std": 0.28764808177948,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.611339807510376,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l1_reward": -0.32825711369514465,
"step": 116
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.6506338028169015,
"calib/avg_num_step_conf": 1.6171875,
"calib/ece": 0.291900826446281,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.359504132231405,
"calib/gap": 0.15911830985915476,
"calib/mean_conf": 0.6360330578512396,
"calib/mu_c": 0.7293999999999998,
"calib/mu_w": 0.5702816901408451,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.25735537190082647,
"calib/std_conf": 0.3401676284364873,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.6961691176470588,
"calib/step_q_c_n": 136.0,
"calib/step_q_gap": 0.16289573635209476,
"calib/step_q_w": 0.533273381294964,
"calib/step_q_w_n": 278.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1682.0,
"completions/max_terminated_length": 1682.0,
"completions/mean_length": 402.79296875,
"completions/mean_terminated_length": 407.5691833496094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.1248,
"grad_norm": 0.005389553029090166,
"kl": 0.259979248046875,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0049,
"num_tokens": 26221005.0,
"reward": 0.2536861300468445,
"reward_std": 0.30577370524406433,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.6082687377929688,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l1_reward": -0.36183398962020874,
"step": 117
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6660320378151261,
"calib/avg_num_step_conf": 2.43359375,
"calib/ece": 0.2161943319838056,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.2793522267206478,
"calib/gap": 0.18538602941176457,
"calib/mean_conf": 0.577246963562753,
"calib/mu_c": 0.6665625,
"calib/mu_w": 0.48117647058823537,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.13761133603238862,
"calib/std_conf": 0.3518414711064845,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6069601769911505,
"calib/step_q_c_n": 226.0,
"calib/step_q_gap": 0.2586912433219649,
"calib/step_q_w": 0.34826893366918554,
"calib/step_q_w_n": 397.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 407.22265625,
"completions/mean_terminated_length": 413.6865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.005140895489603281,
"kl": 0.269775390625,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0047,
"num_tokens": 26429262.0,
"reward": 0.28935784101486206,
"reward_std": 0.2676890194416046,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6488054990768433,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l1_reward": -0.35290229320526123,
"step": 118
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.6801426452934116,
"calib/avg_num_step_conf": 1.515625,
"calib/ece": 0.22511204481792713,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3025210084033613,
"calib/gap": 0.21985994397759107,
"calib/mean_conf": 0.5703501400560225,
"calib/mu_c": 0.680280112044818,
"calib/mu_w": 0.4604201680672269,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.14773109243697474,
"calib/std_conf": 0.35998472150569677,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.6668528735632184,
"calib/step_q_c_n": 174.0,
"calib/step_q_gap": 0.204742125899667,
"calib/step_q_w": 0.4621107476635514,
"calib/step_q_w_n": 214.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1673.0,
"completions/max_terminated_length": 1673.0,
"completions/mean_length": 443.99609375,
"completions/mean_terminated_length": 447.49212646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.0047919717617332935,
"kl": 0.262969970703125,
"learning_rate": 2.25e-06,
"loss": 0.0135,
"num_tokens": 26647989.0,
"reward": 0.30583804845809937,
"reward_std": 0.27502328157424927,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6427181959152222,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l1_reward": -0.3036983609199524,
"step": 119
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6918258160639307,
"calib/avg_num_step_conf": 1.45703125,
"calib/ece": 0.1816734693877551,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.2938775510204082,
"calib/gap": 0.21975145604767699,
"calib/mean_conf": 0.5871428571428571,
"calib/mu_c": 0.6831159420289854,
"calib/mu_w": 0.4633644859813084,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.10277551020408163,
"calib/std_conf": 0.34212302358260854,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.649336569579288,
"calib/step_q_c_n": 206.0,
"calib/step_q_gap": 0.11979565141561532,
"calib/step_q_w": 0.5295409181636727,
"calib/step_q_w_n": 167.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1621.0,
"completions/max_terminated_length": 1621.0,
"completions/mean_length": 395.3359375,
"completions/mean_terminated_length": 398.4488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.128,
"grad_norm": 0.0057893842458724976,
"kl": 0.268463134765625,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0043,
"num_tokens": 26855883.0,
"reward": 0.3432433009147644,
"reward_std": 0.26072174310684204,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.698214054107666,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": -0.3070399761199951,
"step": 120
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5701693981545242,
"calib/avg_num_step_conf": 1.8203125,
"calib/ece": 0.26732510288065847,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.25102880658436216,
"calib/gap": 0.09005853188266078,
"calib/mean_conf": 0.5504526748971194,
"calib/mu_c": 0.6012264150943396,
"calib/mu_w": 0.5111678832116788,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.19078189300411524,
"calib/std_conf": 0.3515295807101361,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6165358024691358,
"calib/step_q_c_n": 162.0,
"calib/step_q_gap": 0.20650257878492517,
"calib/step_q_w": 0.4100332236842106,
"calib/step_q_w_n": 304.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2897.0,
"completions/max_terminated_length": 2897.0,
"completions/mean_length": 464.125,
"completions/mean_terminated_length": 475.2640075683594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.005195781122893095,
"kl": 0.25836181640625,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0109,
"num_tokens": 27079755.0,
"reward": 0.25892913341522217,
"reward_std": 0.3152400851249695,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/final_brier_reward_step": 0.6206378936767578,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": -0.37231090664863586,
"step": 121
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6745995273109243,
"calib/avg_num_step_conf": 1.6640625,
"calib/ece": 0.22302699055330633,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.2793522267206478,
"calib/gap": 0.20782245710784325,
"calib/mean_conf": 0.5758151147098516,
"calib/mu_c": 0.6759401041666667,
"calib/mu_w": 0.46811764705882347,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14031174089068826,
"calib/std_conf": 0.3581826466658249,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.69139896373057,
"calib/step_q_c_n": 193.0,
"calib/step_q_gap": 0.1968727834730592,
"calib/step_q_w": 0.4945261802575108,
"calib/step_q_w_n": 233.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1538.0,
"completions/max_terminated_length": 1538.0,
"completions/mean_length": 381.82421875,
"completions/mean_terminated_length": 386.351806640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.005926921498030424,
"kl": 0.29742431640625,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0226,
"num_tokens": 27284846.0,
"reward": 0.3194349706172943,
"reward_std": 0.2801957130432129,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6791479587554932,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": -0.3285592198371887,
"step": 122
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6145352900069883,
"calib/avg_num_step_conf": 1.8671875,
"calib/ece": 0.25282157676348554,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.2157676348547718,
"calib/gap": 0.13559678546470993,
"calib/mean_conf": 0.5052697095435685,
"calib/mu_c": 0.5812264150943396,
"calib/mu_w": 0.44562962962962965,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.15912863070539424,
"calib/std_conf": 0.3547578653453605,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.5037560975609756,
"calib/step_q_c_n": 205.0,
"calib/step_q_gap": 0.04045939426427231,
"calib/step_q_w": 0.4632967032967033,
"calib/step_q_w_n": 273.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1970.0,
"completions/max_terminated_length": 1970.0,
"completions/mean_length": 478.703125,
"completions/mean_terminated_length": 480.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.1312,
"grad_norm": 0.004890909418463707,
"kl": 0.2481689453125,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0048,
"num_tokens": 27512682.0,
"reward": 0.2750769853591919,
"reward_std": 0.25875532627105713,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6226316690444946,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l1_reward": -0.3349777162075043,
"step": 123
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6332996292551397,
"calib/avg_num_step_conf": 1.59765625,
"calib/ece": 0.23532786885245904,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.29508196721311475,
"calib/gap": 0.1512854735422986,
"calib/mean_conf": 0.5888524590163935,
"calib/mu_c": 0.66015503875969,
"calib/mu_w": 0.5088695652173914,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.14774590163934428,
"calib/std_conf": 0.3425404049909007,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6702487562189056,
"calib/step_q_c_n": 201.0,
"calib/step_q_gap": 0.14543625621890555,
"calib/step_q_w": 0.5248125,
"calib/step_q_w_n": 208.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2125.0,
"completions/max_terminated_length": 2125.0,
"completions/mean_length": 419.734375,
"completions/mean_terminated_length": 423.03936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.005201234016567469,
"kl": 0.279571533203125,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0035,
"num_tokens": 27726950.0,
"reward": 0.3044394850730896,
"reward_std": 0.2931825518608093,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6657023429870605,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": -0.3443233370780945,
"step": 124
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6598881900768693,
"calib/avg_num_step_conf": 1.828125,
"calib/ece": 0.21829875518672198,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.23651452282157676,
"calib/gap": 0.19087002096436073,
"calib/mean_conf": 0.5350622406639005,
"calib/mu_c": 0.6419811320754718,
"calib/mu_w": 0.4511111111111111,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.15676348547717844,
"calib/std_conf": 0.3517426920879851,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.5855860215053763,
"calib/step_q_c_n": 186.0,
"calib/step_q_gap": 0.10646793639899332,
"calib/step_q_w": 0.479118085106383,
"calib/step_q_w_n": 282.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1630.0,
"completions/max_terminated_length": 1630.0,
"completions/mean_length": 439.90625,
"completions/mean_terminated_length": 445.12255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.00522991269826889,
"kl": 0.262451171875,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0011,
"num_tokens": 27944374.0,
"reward": 0.26843976974487305,
"reward_std": 0.2542046904563904,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6382890939712524,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l1_reward": -0.3639094829559326,
"step": 125
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.6901683161164487,
"calib/avg_num_step_conf": 2.25390625,
"calib/ece": 0.2394915254237288,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3135593220338983,
"calib/gap": 0.2394957740374195,
"calib/mean_conf": 0.5553389830508475,
"calib/mu_c": 0.6842201834862384,
"calib/mu_w": 0.44472440944881886,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.16648305084745763,
"calib/std_conf": 0.3718521717468706,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.6838181818181818,
"calib/step_q_c_n": 165.0,
"calib/step_q_gap": 0.3663399455722271,
"calib/step_q_w": 0.31747823624595467,
"calib/step_q_w_n": 412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 466.31640625,
"completions/mean_terminated_length": 477.5080261230469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.1344,
"grad_norm": 0.004744419362396002,
"kl": 0.2213134765625,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0029,
"num_tokens": 28169215.0,
"reward": 0.2959335148334503,
"reward_std": 0.26156705617904663,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.6451636552810669,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l1_reward": -0.31735917925834656,
"step": 126
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.668756967670011,
"calib/avg_num_step_conf": 2.25390625,
"calib/ece": 0.23289214876033065,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.2727272727272727,
"calib/gap": 0.20829358974358975,
"calib/mean_conf": 0.5742979338842975,
"calib/mu_c": 0.693076923076923,
"calib/mu_w": 0.4847833333333333,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.18871900826446286,
"calib/std_conf": 0.35634054730957604,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.574974358974359,
"calib/step_q_c_n": 195.0,
"calib/step_q_gap": 0.22295053698483025,
"calib/step_q_w": 0.35202382198952875,
"calib/step_q_w_n": 382.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3017.0,
"completions/max_terminated_length": 3017.0,
"completions/mean_length": 427.49609375,
"completions/mean_terminated_length": 437.7560119628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.00546091515570879,
"kl": 0.27142333984375,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0057,
"num_tokens": 28382326.0,
"reward": 0.2984740138053894,
"reward_std": 0.2718273997306824,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6555015444755554,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": -0.324178546667099,
"step": 127
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.584051724137931,
"calib/avg_num_step_conf": 1.6640625,
"calib/ece": 0.24991666666666662,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.2833333333333333,
"calib/gap": 0.12667408231368193,
"calib/mean_conf": 0.5155,
"calib/mu_c": 0.580948275862069,
"calib/mu_w": 0.45427419354838705,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.14104166666666668,
"calib/std_conf": 0.37323774818025396,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.6690751445086706,
"calib/step_q_c_n": 173.0,
"calib/step_q_gap": 0.18159135004226734,
"calib/step_q_w": 0.4874837944664032,
"calib/step_q_w_n": 253.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2049.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 419.8671875,
"completions/mean_terminated_length": 428.2310791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.005378535017371178,
"kl": 0.268280029296875,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0027,
"num_tokens": 28596476.0,
"reward": 0.2697708010673523,
"reward_std": 0.2821349501609802,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6162769794464111,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": -0.3493916392326355,
"step": 128
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.610390866873065,
"calib/avg_num_step_conf": 1.91796875,
"calib/ece": 0.24255600000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.308,
"calib/gap": 0.11783578431372543,
"calib/mean_conf": 0.597436,
"calib/mu_c": 0.6511691176470588,
"calib/mu_w": 0.5333333333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.14799600000000002,
"calib/std_conf": 0.3353037874883014,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.643483193277311,
"calib/step_q_c_n": 238.0,
"calib/step_q_gap": 0.07872128851540627,
"calib/step_q_w": 0.5647619047619047,
"calib/step_q_w_n": 252.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2702.0,
"completions/max_terminated_length": 2702.0,
"completions/mean_length": 391.97265625,
"completions/mean_terminated_length": 393.50982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1376,
"grad_norm": 0.005064961966127157,
"kl": 0.27825927734375,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0205,
"num_tokens": 28799205.0,
"reward": 0.31468018889427185,
"reward_std": 0.26014626026153564,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6678237915039062,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.336119681596756,
"step": 129
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6901665989435188,
"calib/avg_num_step_conf": 1.55859375,
"calib/ece": 0.17555102040816326,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.2530612244897959,
"calib/gap": 0.24424488690234325,
"calib/mean_conf": 0.5473877551020409,
"calib/mu_c": 0.6540579710144928,
"calib/mu_w": 0.40981308411214956,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.07983673469387752,
"calib/std_conf": 0.3549701232801023,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6881642512077294,
"calib/step_q_c_n": 207.0,
"calib/step_q_gap": 0.23933091787439603,
"calib/step_q_w": 0.44883333333333336,
"calib/step_q_w_n": 192.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2639.0,
"completions/max_terminated_length": 2639.0,
"completions/mean_length": 400.79296875,
"completions/mean_terminated_length": 403.9488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.00564991869032383,
"kl": 0.272186279296875,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0351,
"num_tokens": 29007096.0,
"reward": 0.33700209856033325,
"reward_std": 0.25782716274261475,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6895445585250854,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": -0.30929034948349,
"step": 130
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7038740245261984,
"calib/avg_num_step_conf": 2.40625,
"calib/ece": 0.1713616935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.16129032258064516,
"calib/gap": 0.23632934782608694,
"calib/mean_conf": 0.48767056451612906,
"calib/mu_c": 0.636329347826087,
"calib/mu_w": 0.4,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14403225806451614,
"calib/std_conf": 0.3277529285265745,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5875722532588454,
"calib/step_q_c_n": 179.0,
"calib/step_q_gap": 0.1450061205357333,
"calib/step_q_w": 0.4425661327231121,
"calib/step_q_w_n": 437.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2405.0,
"completions/max_terminated_length": 2405.0,
"completions/mean_length": 394.24609375,
"completions/mean_terminated_length": 398.92095947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.0058201816864311695,
"kl": 0.248626708984375,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0142,
"num_tokens": 29214231.0,
"reward": 0.3132360577583313,
"reward_std": 0.2416331171989441,
"rewards/accuracy_reward_step": 0.359375,
"rewards/final_brier_reward_step": 0.717965841293335,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.353993684053421,
"step": 131
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.671569327431101,
"calib/avg_num_step_conf": 1.7734375,
"calib/ece": 0.22041152263374486,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.3950617283950617,
"calib/gap": 0.2276802798800513,
"calib/mean_conf": 0.5984362139917696,
"calib/mu_c": 0.6865100671140939,
"calib/mu_w": 0.4588297872340426,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.10283950617283948,
"calib/std_conf": 0.38021656034767987,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.67275956284153,
"calib/step_q_c_n": 244.0,
"calib/step_q_gap": 0.1910928961748634,
"calib/step_q_w": 0.48166666666666663,
"calib/step_q_w_n": 210.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2912.0,
"completions/max_terminated_length": 2912.0,
"completions/mean_length": 399.59765625,
"completions/mean_terminated_length": 405.94049072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.1408,
"grad_norm": 0.0052103460766375065,
"kl": 0.26055908203125,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0196,
"num_tokens": 29422120.0,
"reward": 0.33788812160491943,
"reward_std": 0.274682879447937,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6694461107254028,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": -0.29210734367370605,
"step": 132
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6606093189964157,
"calib/avg_num_step_conf": 1.984375,
"calib/ece": 0.2209726530612245,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.1914083870967742,
"calib/mean_conf": 0.4929048979591837,
"calib/mu_c": 0.614,
"calib/mu_w": 0.4225916129032258,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.17326530612244895,
"calib/std_conf": 0.338006179596465,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5865408805031448,
"calib/step_q_c_n": 159.0,
"calib/step_q_gap": 0.12561910399884685,
"calib/step_q_w": 0.46092177650429794,
"calib/step_q_w_n": 349.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2956.0,
"completions/max_terminated_length": 2956.0,
"completions/mean_length": 501.3359375,
"completions/mean_terminated_length": 507.2806396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.0043398430570960045,
"kl": 0.2178497314453125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0081,
"num_tokens": 29656806.0,
"reward": 0.28701332211494446,
"reward_std": 0.26014071702957153,
"rewards/accuracy_reward_step": 0.3515625,
"rewards/final_brier_reward_step": 0.6839252710342407,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.3684923052787781,
"step": 133
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6431696162258898,
"calib/avg_num_step_conf": 2.0703125,
"calib/ece": 0.22766129032258062,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.23387096774193547,
"calib/gap": 0.17175979319944323,
"calib/mean_conf": 0.513467741935484,
"calib/mu_c": 0.6111214953271028,
"calib/mu_w": 0.43936170212765957,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.15483870967741933,
"calib/std_conf": 0.3590434072842112,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6530263157894737,
"calib/step_q_c_n": 152.0,
"calib/step_q_gap": 0.2766042699340945,
"calib/step_q_w": 0.3764220458553792,
"calib/step_q_w_n": 378.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2827.0,
"completions/max_terminated_length": 2827.0,
"completions/mean_length": 461.46875,
"completions/mean_terminated_length": 463.2784729003906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.00501696951687336,
"kl": 0.22442626953125,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0306,
"num_tokens": 29883894.0,
"reward": 0.28798556327819824,
"reward_std": 0.2733740508556366,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6685503721237183,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.36679795384407043,
"step": 134
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6055826889160223,
"calib/avg_num_step_conf": 2.04296875,
"calib/ece": 0.2603703703703704,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.23045267489711935,
"calib/gap": 0.13574481074481087,
"calib/mean_conf": 0.5422633744855968,
"calib/mu_c": 0.6126495726495728,
"calib/mu_w": 0.4769047619047619,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.16057613168724283,
"calib/std_conf": 0.3544975877031863,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6573026737967915,
"calib/step_q_c_n": 187.0,
"calib/step_q_gap": 0.1926002928444106,
"calib/step_q_w": 0.4647023809523809,
"calib/step_q_w_n": 336.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1355.0,
"completions/max_terminated_length": 1355.0,
"completions/mean_length": 422.26953125,
"completions/mean_terminated_length": 432.4040222167969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.144,
"grad_norm": 0.005108509678393602,
"kl": 0.24139404296875,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0296,
"num_tokens": 30097875.0,
"reward": 0.2935692071914673,
"reward_std": 0.26770222187042236,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.650739848613739,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.34485143423080444,
"step": 135
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7425729442970821,
"calib/avg_num_step_conf": 2.28125,
"calib/ece": 0.16321285140562247,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.23293172690763053,
"calib/gap": 0.2934025198938993,
"calib/mean_conf": 0.5302008032128516,
"calib/mu_c": 0.7010576923076924,
"calib/mu_w": 0.4076551724137931,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.1378714859437751,
"calib/std_conf": 0.35086634006945205,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6574111675126904,
"calib/step_q_c_n": 197.0,
"calib/step_q_gap": 0.22448265760743624,
"calib/step_q_w": 0.4329285099052541,
"calib/step_q_w_n": 387.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2609.0,
"completions/max_terminated_length": 2609.0,
"completions/mean_length": 461.68359375,
"completions/mean_terminated_length": 463.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.005101132206618786,
"kl": 0.210723876953125,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0492,
"num_tokens": 30324554.0,
"reward": 0.337179958820343,
"reward_std": 0.23052147030830383,
"rewards/accuracy_reward_step": 0.40625,
"rewards/final_brier_reward_step": 0.7274148464202881,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.32571113109588623,
"step": 136
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6756081525312295,
"calib/avg_num_step_conf": 2.140625,
"calib/ece": 0.20946194331983808,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.23481781376518218,
"calib/gap": 0.20794786324786324,
"calib/mean_conf": 0.522117004048583,
"calib/mu_c": 0.6315632478632478,
"calib/mu_w": 0.42361538461538456,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.12894736842105267,
"calib/std_conf": 0.3421008175180087,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6059450704225353,
"calib/step_q_c_n": 213.0,
"calib/step_q_gap": 0.12614208534790844,
"calib/step_q_w": 0.4798029850746269,
"calib/step_q_w_n": 335.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 442.06640625,
"completions/mean_terminated_length": 445.5472412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.004609386902302504,
"kl": 0.2218017578125,
"learning_rate": 1.75e-06,
"loss": 0.0563,
"num_tokens": 30544707.0,
"reward": 0.3316650390625,
"reward_std": 0.24586954712867737,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6928571462631226,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.30921459197998047,
"step": 137
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6779999999999999,
"calib/avg_num_step_conf": 1.72265625,
"calib/ece": 0.2064216326530612,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.2653061224489796,
"calib/gap": 0.20887975438596507,
"calib/mean_conf": 0.5405171428571428,
"calib/mu_c": 0.6215113333333335,
"calib/mu_w": 0.41263157894736846,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.0673469387755102,
"calib/std_conf": 0.34302588328591416,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.623115294117647,
"calib/step_q_c_n": 255.0,
"calib/step_q_gap": 0.21311529411764701,
"calib/step_q_w": 0.41,
"calib/step_q_w_n": 186.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2834.0,
"completions/max_terminated_length": 2834.0,
"completions/mean_length": 421.92578125,
"completions/mean_terminated_length": 430.3306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.1472,
"grad_norm": 0.0051317536272108555,
"kl": 0.2580108642578125,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0022,
"num_tokens": 30757056.0,
"reward": 0.33365753293037415,
"reward_std": 0.2487613558769226,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6757357120513916,
"rewards/format_reward_step": 0.921875,
"rewards/step_l1_reward": -0.30998310446739197,
"step": 138
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6721259842519686,
"calib/avg_num_step_conf": 1.8359375,
"calib/ece": 0.21853174603174597,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.29365079365079366,
"calib/gap": 0.2081417322834645,
"calib/mean_conf": 0.5928968253968254,
"calib/mu_c": 0.6961417322834644,
"calib/mu_w": 0.48799999999999993,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1537301587301587,
"calib/std_conf": 0.35177207468701155,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6860891122278059,
"calib/step_q_c_n": 199.0,
"calib/step_q_gap": 0.1736145734824185,
"calib/step_q_w": 0.5124745387453874,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1136.0,
"completions/max_terminated_length": 1136.0,
"completions/mean_length": 369.76171875,
"completions/mean_terminated_length": 374.1462707519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.005164094269275665,
"kl": 0.253387451171875,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0042,
"num_tokens": 30954811.0,
"reward": 0.3282237648963928,
"reward_std": 0.2631811499595642,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7040168046951294,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3413192629814148,
"step": 139
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7424364934977428,
"calib/avg_num_step_conf": 2.0859375,
"calib/ece": 0.16077279999999988,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.324,
"calib/gap": 0.2902782561821979,
"calib/mean_conf": 0.5875471999999999,
"calib/mu_c": 0.7001751633986928,
"calib/mu_w": 0.4098969072164949,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.06815999999999989,
"calib/std_conf": 0.3568073439997557,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6992158914728681,
"calib/step_q_c_n": 258.0,
"calib/step_q_gap": 0.25446951466127393,
"calib/step_q_w": 0.4447463768115942,
"calib/step_q_w_n": 276.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 400.9375,
"completions/mean_terminated_length": 404.094482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.004881748929619789,
"kl": 0.25836181640625,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0058,
"num_tokens": 31162467.0,
"reward": 0.3957335948944092,
"reward_std": 0.2444041669368744,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7389832139015198,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.25845348834991455,
"step": 140
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6920915295062224,
"calib/avg_num_step_conf": 1.95703125,
"calib/ece": 0.18542510121457484,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.31983805668016196,
"calib/gap": 0.2266372273517998,
"calib/mean_conf": 0.5969230769230769,
"calib/mu_c": 0.6941843971631205,
"calib/mu_w": 0.4675471698113207,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.10574898785425096,
"calib/std_conf": 0.3524584653421482,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6735418326693228,
"calib/step_q_c_n": 251.0,
"calib/step_q_gap": 0.28961543266932277,
"calib/step_q_w": 0.3839264,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2182.0,
"completions/max_terminated_length": 2182.0,
"completions/mean_length": 438.5625,
"completions/mean_terminated_length": 443.76287841796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1504,
"grad_norm": 0.005510457791388035,
"kl": 0.240234375,
"learning_rate": 1.638888888888889e-06,
"loss": -0.003,
"num_tokens": 31381835.0,
"reward": 0.35618430376052856,
"reward_std": 0.28215062618255615,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6942168474197388,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": -0.2795044183731079,
"step": 141
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6722083413415337,
"calib/avg_num_step_conf": 2.23828125,
"calib/ece": 0.18711999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.232,
"calib/gap": 0.20345057338714828,
"calib/mean_conf": 0.52816,
"calib/mu_c": 0.6331404958677684,
"calib/mu_w": 0.42968992248062016,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.11563999999999997,
"calib/std_conf": 0.3454501619626194,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6281516587677726,
"calib/step_q_c_n": 211.0,
"calib/step_q_gap": 0.1549008300384908,
"calib/step_q_w": 0.47325082872928176,
"calib/step_q_w_n": 362.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1431.0,
"completions/max_terminated_length": 1431.0,
"completions/mean_length": 427.38671875,
"completions/mean_terminated_length": 432.4545593261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.00481997337192297,
"kl": 0.229736328125,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0145,
"num_tokens": 31596406.0,
"reward": 0.31411874294281006,
"reward_std": 0.25212931632995605,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6740629076957703,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": -0.3255128860473633,
"step": 142
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7154727224576272,
"calib/avg_num_step_conf": 2.26953125,
"calib/ece": 0.21825203252032527,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.2764227642276423,
"calib/gap": 0.2252635063559324,
"calib/mean_conf": 0.6151626016260164,
"calib/mu_c": 0.7323728813559324,
"calib/mu_w": 0.507109375,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.17686991869918706,
"calib/std_conf": 0.32688567092088044,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.6581127450980392,
"calib/step_q_c_n": 204.0,
"calib/step_q_gap": 0.22476703333853437,
"calib/step_q_w": 0.4333457117595048,
"calib/step_q_w_n": 377.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2672.0,
"completions/max_terminated_length": 2672.0,
"completions/mean_length": 445.8125,
"completions/mean_terminated_length": 449.3228454589844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.00506249163299799,
"kl": 0.212554931640625,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0262,
"num_tokens": 31817870.0,
"reward": 0.32942867279052734,
"reward_std": 0.2550898492336273,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7016031742095947,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3247770071029663,
"step": 143
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.685218253968254,
"calib/avg_num_step_conf": 2.16796875,
"calib/ece": 0.17514056224899593,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3172690763052209,
"calib/gap": 0.2356011904761905,
"calib/mean_conf": 0.5937751004016064,
"calib/mu_c": 0.693125,
"calib/mu_w": 0.4575238095238095,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.09530120481927706,
"calib/std_conf": 0.34731472137706876,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.65,
"calib/step_q_c_n": 272.0,
"calib/step_q_gap": 0.2069515901060071,
"calib/step_q_w": 0.4430484098939929,
"calib/step_q_w_n": 283.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1496.0,
"completions/max_terminated_length": 1496.0,
"completions/mean_length": 405.92578125,
"completions/mean_terminated_length": 412.36907958984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.1536,
"grad_norm": 0.0052193524315953255,
"kl": 0.250244140625,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0024,
"num_tokens": 32025915.0,
"reward": 0.36195603013038635,
"reward_std": 0.24744543433189392,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7256039381027222,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.30794191360473633,
"step": 144
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6365524402907581,
"calib/avg_num_step_conf": 2.0546875,
"calib/ece": 0.2133864541832669,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2788844621513944,
"calib/gap": 0.15247988058151618,
"calib/mean_conf": 0.5821513944223108,
"calib/mu_c": 0.6471527777777778,
"calib/mu_w": 0.4946728971962616,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11091633466135459,
"calib/std_conf": 0.33655412384183914,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.630611880046136,
"calib/step_q_c_n": 289.0,
"calib/step_q_gap": 0.12860597287314024,
"calib/step_q_w": 0.5020059071729958,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2292.0,
"completions/max_terminated_length": 2292.0,
"completions/mean_length": 402.51953125,
"completions/mean_terminated_length": 402.51953125,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.005096918903291225,
"kl": 0.230438232421875,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0063,
"num_tokens": 32231664.0,
"reward": 0.3254355192184448,
"reward_std": 0.24309183657169342,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6948660612106323,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.35024493932724,
"step": 145
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6668700427089689,
"calib/avg_num_step_conf": 2.66015625,
"calib/ece": 0.2104838709677419,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.23790322580645162,
"calib/gap": 0.1873215375228796,
"calib/mean_conf": 0.5492741935483871,
"calib/mu_c": 0.6618181818181817,
"calib/mu_w": 0.47449664429530214,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.18028225806451614,
"calib/std_conf": 0.3333231343826471,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6047111111111111,
"calib/step_q_c_n": 225.0,
"calib/step_q_gap": 0.1556343567251462,
"calib/step_q_w": 0.4490767543859649,
"calib/step_q_w_n": 456.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2363.0,
"completions/max_terminated_length": 2363.0,
"completions/mean_length": 431.9140625,
"completions/mean_terminated_length": 440.5179443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.005165283568203449,
"kl": 0.2237548828125,
"learning_rate": 1.5e-06,
"loss": -0.0307,
"num_tokens": 32449450.0,
"reward": 0.29126691818237305,
"reward_std": 0.24586060643196106,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.6712952852249146,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.35438641905784607,
"step": 146
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6534571723426212,
"calib/avg_num_step_conf": 2.234375,
"calib/ece": 0.27165282258064516,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.24193548387096775,
"calib/gap": 0.19355110423116634,
"calib/mean_conf": 0.5237504032258066,
"calib/mu_c": 0.6431589473684212,
"calib/mu_w": 0.4496078431372548,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.20616935483870968,
"calib/std_conf": 0.35710478809063423,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5963912087912089,
"calib/step_q_c_n": 182.0,
"calib/step_q_gap": 0.09788855921855927,
"calib/step_q_w": 0.4985026495726496,
"calib/step_q_w_n": 390.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2664.0,
"completions/max_terminated_length": 2664.0,
"completions/mean_length": 467.9609375,
"completions/mean_terminated_length": 475.388916015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1568,
"grad_norm": 0.0046214209869503975,
"kl": 0.192108154296875,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0109,
"num_tokens": 32672928.0,
"reward": 0.2987235188484192,
"reward_std": 0.2539418339729309,
"rewards/accuracy_reward_step": 0.37109375,
"rewards/final_brier_reward_step": 0.6737816333770752,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.3396158218383789,
"step": 147
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6424822881133561,
"calib/avg_num_step_conf": 2.50390625,
"calib/ece": 0.22454183266932276,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3187250996015936,
"calib/gap": 0.18160062975596952,
"calib/mean_conf": 0.5973705179282869,
"calib/mu_c": 0.6718918918918919,
"calib/mu_w": 0.4902912621359224,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11613545816733076,
"calib/std_conf": 0.35981438631180784,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6342083892617449,
"calib/step_q_c_n": 298.0,
"calib/step_q_gap": 0.21874065359605005,
"calib/step_q_w": 0.4154677356656949,
"calib/step_q_w_n": 343.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1738.0,
"completions/max_terminated_length": 1738.0,
"completions/mean_length": 413.1953125,
"completions/mean_terminated_length": 414.8157043457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.005058152601122856,
"kl": 0.229888916015625,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0145,
"num_tokens": 32883818.0,
"reward": 0.3459177315235138,
"reward_std": 0.2623218894004822,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6996320486068726,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.31795281171798706,
"step": 148
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6855576441102758,
"calib/avg_num_step_conf": 2.48828125,
"calib/ece": 0.18735177865612646,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2134387351778656,
"calib/gap": 0.23046115288220548,
"calib/mean_conf": 0.5181818181818183,
"calib/mu_c": 0.6393333333333333,
"calib/mu_w": 0.40887218045112783,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.11561264822134389,
"calib/std_conf": 0.3482950661553457,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6032245535714286,
"calib/step_q_c_n": 224.0,
"calib/step_q_gap": 0.2076306068401937,
"calib/step_q_w": 0.3955939467312349,
"calib/step_q_w_n": 413.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1472.0,
"completions/max_terminated_length": 1472.0,
"completions/mean_length": 441.078125,
"completions/mean_terminated_length": 444.5511779785156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.004603913053870201,
"kl": 0.223388671875,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0171,
"num_tokens": 33101190.0,
"reward": 0.32823610305786133,
"reward_std": 0.2496287077665329,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7171752452850342,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.34820303320884705,
"step": 149
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7085946883758174,
"calib/avg_num_step_conf": 1.8984375,
"calib/ece": 0.25006448979591833,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.3836734693877551,
"calib/gap": 0.20303579340718014,
"calib/mean_conf": 0.6480579591836734,
"calib/mu_c": 0.7533050847457626,
"calib/mu_w": 0.5502692913385825,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.20824489795918363,
"calib/std_conf": 0.34109737443260196,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6784527363184079,
"calib/step_q_c_n": 201.0,
"calib/step_q_gap": 0.1594004556166535,
"calib/step_q_w": 0.5190522807017544,
"calib/step_q_w_n": 285.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2999.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 392.75390625,
"completions/mean_terminated_length": 397.41107177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.16,
"grad_norm": 0.005182725843042135,
"kl": 0.2346649169921875,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.011,
"num_tokens": 33306695.0,
"reward": 0.32051196694374084,
"reward_std": 0.2669872045516968,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6684824228286743,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.30870845913887024,
"step": 150
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6268835616438355,
"calib/avg_num_step_conf": 1.89453125,
"calib/ece": 0.2838617886178862,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.3130081300813008,
"calib/gap": 0.16171780821917825,
"calib/mean_conf": 0.578821138211382,
"calib/mu_c": 0.6748000000000002,
"calib/mu_w": 0.5130821917808219,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.22808943089430894,
"calib/std_conf": 0.3483547750631152,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6790375586854461,
"calib/step_q_c_n": 142.0,
"calib/step_q_gap": 0.2337022817175161,
"calib/step_q_w": 0.44533527696793,
"calib/step_q_w_n": 343.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1756.0,
"completions/max_terminated_length": 1756.0,
"completions/mean_length": 438.80859375,
"completions/mean_terminated_length": 447.5498046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.005079971626400948,
"kl": 0.1945037841796875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0214,
"num_tokens": 33526054.0,
"reward": 0.27748289704322815,
"reward_std": 0.25188490748405457,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.653252363204956,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.36625534296035767,
"step": 151
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6625622484480523,
"calib/avg_num_step_conf": 2.30078125,
"calib/ece": 0.20938524590163926,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.2827868852459016,
"calib/gap": 0.20896036564567838,
"calib/mean_conf": 0.5319262295081967,
"calib/mu_c": 0.649252336448598,
"calib/mu_w": 0.44029197080291965,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.15139344262295074,
"calib/std_conf": 0.3562808495249814,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6407761111111111,
"calib/step_q_c_n": 180.0,
"calib/step_q_gap": 0.1892076514534094,
"calib/step_q_w": 0.4515684596577017,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2099.0,
"completions/max_terminated_length": 2099.0,
"completions/mean_length": 420.76953125,
"completions/mean_terminated_length": 425.7589111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.004639812279492617,
"kl": 0.215057373046875,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0143,
"num_tokens": 33739163.0,
"reward": 0.29684919118881226,
"reward_std": 0.2769307494163513,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6758453249931335,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": -0.35324063897132874,
"step": 152
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6043184183142561,
"calib/avg_num_step_conf": 2.1875,
"calib/ece": 0.2664112903225807,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.31048387096774194,
"calib/gap": 0.13104838709677424,
"calib/mean_conf": 0.5618145161290322,
"calib/mu_c": 0.6273387096774194,
"calib/mu_w": 0.4962903225806452,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.16411290322580654,
"calib/std_conf": 0.3539744584128025,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6189377151799687,
"calib/step_q_c_n": 213.0,
"calib/step_q_gap": 0.19640169212521358,
"calib/step_q_w": 0.42253602305475507,
"calib/step_q_w_n": 347.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2387.0,
"completions/max_terminated_length": 2387.0,
"completions/mean_length": 425.14453125,
"completions/mean_terminated_length": 430.185791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.1632,
"grad_norm": 0.005156941246241331,
"kl": 0.226715087890625,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0357,
"num_tokens": 33955320.0,
"reward": 0.28570637106895447,
"reward_std": 0.2652091979980469,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6410496234893799,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.35479307174682617,
"step": 153
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6710442837203401,
"calib/avg_num_step_conf": 1.97265625,
"calib/ece": 0.202806324110672,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2608695652173913,
"calib/gap": 0.21653977921583556,
"calib/mean_conf": 0.5403557312252965,
"calib/mu_c": 0.6618918918918919,
"calib/mu_w": 0.44535211267605634,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15221343873517792,
"calib/std_conf": 0.3526369774742473,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6392572463768116,
"calib/step_q_c_n": 184.0,
"calib/step_q_gap": 0.16298248002167143,
"calib/step_q_w": 0.47627476635514016,
"calib/step_q_w_n": 321.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1054.0,
"completions/max_terminated_length": 1054.0,
"completions/mean_length": 391.03515625,
"completions/mean_terminated_length": 394.1141662597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.005356749519705772,
"kl": 0.24224853515625,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0162,
"num_tokens": 34159865.0,
"reward": 0.3283141255378723,
"reward_std": 0.2631031572818756,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7153323888778687,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.34229791164398193,
"step": 154
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5638262154176739,
"calib/avg_num_step_conf": 2.046875,
"calib/ece": 0.2869391129032258,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.23387096774193547,
"calib/gap": 0.0789156459844213,
"calib/mean_conf": 0.5112866935483872,
"calib/mu_c": 0.5577450980392158,
"calib/mu_w": 0.4788294520547945,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.19346774193548386,
"calib/std_conf": 0.34740902862434725,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5410261780104713,
"calib/step_q_c_n": 191.0,
"calib/step_q_gap": 0.008083235067528394,
"calib/step_q_w": 0.5329429429429429,
"calib/step_q_w_n": 333.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 424.2890625,
"completions/mean_terminated_length": 425.9529724121094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.0087783457711339,
"kl": 0.311614990234375,
"learning_rate": 1.25e-06,
"loss": 0.0113,
"num_tokens": 34375699.0,
"reward": 0.2517399787902832,
"reward_std": 0.2683244049549103,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.6324470043182373,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3984982967376709,
"step": 155
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6825293350717081,
"calib/avg_num_step_conf": 2.5859375,
"calib/ece": 0.22645161290322585,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3225806451612903,
"calib/gap": 0.21190873533246407,
"calib/mean_conf": 0.5745967741935484,
"calib/mu_c": 0.6856779661016948,
"calib/mu_w": 0.4737692307692308,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1626209677419355,
"calib/std_conf": 0.3567401674697824,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6028136882129278,
"calib/step_q_c_n": 263.0,
"calib/step_q_gap": 0.23851218445352929,
"calib/step_q_w": 0.3643015037593985,
"calib/step_q_w_n": 399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 430.71484375,
"completions/mean_terminated_length": 434.1062927246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1664,
"grad_norm": 0.004716483876109123,
"kl": 0.216949462890625,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.008,
"num_tokens": 34590722.0,
"reward": 0.3178238570690155,
"reward_std": 0.2589898109436035,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6852397918701172,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.3331858515739441,
"step": 156
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7456790123456791,
"calib/avg_num_step_conf": 2.25390625,
"calib/ece": 0.1397453815261045,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.285140562248996,
"calib/gap": 0.30451384015594535,
"calib/mean_conf": 0.5502136546184738,
"calib/mu_c": 0.6896296296296296,
"calib/mu_w": 0.38511578947368424,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.07389518072289164,
"calib/std_conf": 0.3584324278796773,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6650764525993883,
"calib/step_q_c_n": 327.0,
"calib/step_q_gap": 0.1928768525993883,
"calib/step_q_w": 0.4721996,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1766.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 415.85546875,
"completions/mean_terminated_length": 420.7865905761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.005070955958217382,
"kl": 0.218902587890625,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0121,
"num_tokens": 34800909.0,
"reward": 0.3705405592918396,
"reward_std": 0.23960906267166138,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7493710517883301,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.30672743916511536,
"step": 157
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.62321875,
"calib/avg_num_step_conf": 2.14453125,
"calib/ece": 0.2537154150197629,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.308300395256917,
"calib/gap": 0.15088875000000013,
"calib/mean_conf": 0.5915810276679843,
"calib/mu_c": 0.6679200000000002,
"calib/mu_w": 0.51703125,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.17561264822134398,
"calib/std_conf": 0.35225511544819077,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6522580645161291,
"calib/step_q_c_n": 248.0,
"calib/step_q_gap": 0.18049726717393644,
"calib/step_q_w": 0.4717607973421927,
"calib/step_q_w_n": 301.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2380.0,
"completions/max_terminated_length": 2380.0,
"completions/mean_length": 403.03515625,
"completions/mean_terminated_length": 404.61572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0048288567923009396,
"kl": 0.250335693359375,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0188,
"num_tokens": 35009326.0,
"reward": 0.2849690318107605,
"reward_std": 0.29535138607025146,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6730554699897766,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.39530491828918457,
"step": 158
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6866359447004607,
"calib/avg_num_step_conf": 1.89453125,
"calib/ece": 0.19579999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.272,
"calib/gap": 0.2196927803379417,
"calib/mean_conf": 0.59492,
"calib/mu_c": 0.7056451612903226,
"calib/mu_w": 0.4859523809523809,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14736,
"calib/std_conf": 0.337823909159787,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7478567839195981,
"calib/step_q_c_n": 199.0,
"calib/step_q_gap": 0.2496672734300876,
"calib/step_q_w": 0.4981895104895105,
"calib/step_q_w_n": 286.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 395.984375,
"completions/mean_terminated_length": 400.67987060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1696,
"grad_norm": 0.005460195243358612,
"kl": 0.2312164306640625,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0123,
"num_tokens": 35215482.0,
"reward": 0.34846895933151245,
"reward_std": 0.2540527582168579,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7034000158309937,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.29474329948425293,
"step": 159
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.660592998955068,
"calib/avg_num_step_conf": 2.390625,
"calib/ece": 0.21722177419354838,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.29435483870967744,
"calib/gap": 0.2003335945663533,
"calib/mean_conf": 0.5518104838709676,
"calib/mu_c": 0.6584396551724139,
"calib/mu_w": 0.45810606060606057,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1506451612903226,
"calib/std_conf": 0.35852603772780317,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6219877637130801,
"calib/step_q_c_n": 237.0,
"calib/step_q_gap": 0.17720109704641335,
"calib/step_q_w": 0.4447866666666667,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 445.890625,
"completions/mean_terminated_length": 447.6392517089844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.004912951961159706,
"kl": 0.2237091064453125,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0148,
"num_tokens": 35434470.0,
"reward": 0.31076303124427795,
"reward_std": 0.24745705723762512,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6821966171264648,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.34270179271698,
"step": 160
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6679886109416311,
"calib/avg_num_step_conf": 2.17578125,
"calib/ece": 0.20129032258064516,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.29435483870967744,
"calib/gap": 0.21405735204392912,
"calib/mean_conf": 0.559516129032258,
"calib/mu_c": 0.6449664429530201,
"calib/mu_w": 0.43090909090909096,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.08000000000000002,
"calib/std_conf": 0.3587587356218878,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6135987500000001,
"calib/step_q_c_n": 320.0,
"calib/step_q_gap": 0.15349832805907182,
"calib/step_q_w": 0.46010042194092826,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1717.0,
"completions/max_terminated_length": 1717.0,
"completions/mean_length": 387.46484375,
"completions/mean_terminated_length": 390.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.005282443482428789,
"kl": 0.236663818359375,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0105,
"num_tokens": 35637581.0,
"reward": 0.3536580801010132,
"reward_std": 0.21946199238300323,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7094613313674927,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.31152018904685974,
"step": 161
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6402298850574712,
"calib/avg_num_step_conf": 2.17578125,
"calib/ece": 0.22977600000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.304,
"calib/gap": 0.16419573070607563,
"calib/mean_conf": 0.590624,
"calib/mu_c": 0.6595862068965518,
"calib/mu_w": 0.49539047619047616,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12020000000000003,
"calib/std_conf": 0.35114941922776977,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6168284280936455,
"calib/step_q_c_n": 299.0,
"calib/step_q_gap": 0.09871718778356797,
"calib/step_q_w": 0.5181112403100775,
"calib/step_q_w_n": 258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 411.40234375,
"completions/mean_terminated_length": 413.0157165527344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.1728,
"grad_norm": 0.005542648956179619,
"kl": 0.2258453369140625,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0398,
"num_tokens": 35847044.0,
"reward": 0.3267264664173126,
"reward_std": 0.22407203912734985,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.687700629234314,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.34049761295318604,
"step": 162
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6858924141532838,
"calib/avg_num_step_conf": 2.76171875,
"calib/ece": 0.1811646586345381,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.23293172690763053,
"calib/gap": 0.2193458676067372,
"calib/mean_conf": 0.5221285140562248,
"calib/mu_c": 0.6436936936936937,
"calib/mu_w": 0.4243478260869565,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12875502008032125,
"calib/std_conf": 0.34453491081351667,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6264636363636363,
"calib/step_q_c_n": 220.0,
"calib/step_q_gap": 0.21332195258540226,
"calib/step_q_w": 0.41314168377823407,
"calib/step_q_w_n": 487.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2729.0,
"completions/max_terminated_length": 2729.0,
"completions/mean_length": 437.140625,
"completions/mean_terminated_length": 442.3241271972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.00472784461453557,
"kl": 0.214599609375,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0082,
"num_tokens": 36063784.0,
"reward": 0.3299524486064911,
"reward_std": 0.2258731722831726,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.7127512097358704,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.33331501483917236,
"step": 163
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7229080475110102,
"calib/avg_num_step_conf": 2.49609375,
"calib/ece": 0.16824489795918357,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.27755102040816326,
"calib/gap": 0.2610036033631389,
"calib/mean_conf": 0.5477551020408162,
"calib/mu_c": 0.6830508474576271,
"calib/mu_w": 0.42204724409448824,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.11718367346938768,
"calib/std_conf": 0.3465201075304993,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5682908424908425,
"calib/step_q_c_n": 273.0,
"calib/step_q_gap": 0.12329084249084249,
"calib/step_q_w": 0.445,
"calib/step_q_w_n": 366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2211.0,
"completions/max_terminated_length": 2211.0,
"completions/mean_length": 425.19921875,
"completions/mean_terminated_length": 437.152587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.005297855939716101,
"kl": 0.23809814453125,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.058,
"num_tokens": 36278771.0,
"reward": 0.3335390090942383,
"reward_std": 0.2471492439508438,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7153980731964111,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.3303512632846832,
"step": 164
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6616358024691358,
"calib/avg_num_step_conf": 2.28125,
"calib/ece": 0.1992156862745098,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.24705882352941178,
"calib/gap": 0.19099999999999978,
"calib/mean_conf": 0.5538823529411765,
"calib/mu_c": 0.6549999999999999,
"calib/mu_w": 0.46400000000000013,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14125490196078427,
"calib/std_conf": 0.33801645435403216,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6717857142857143,
"calib/step_q_c_n": 252.0,
"calib/step_q_gap": 0.2090146299483649,
"calib/step_q_w": 0.4627710843373494,
"calib/step_q_w_n": 332.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1499.0,
"completions/max_terminated_length": 1499.0,
"completions/mean_length": 430.40625,
"completions/mean_terminated_length": 432.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.176,
"grad_norm": 0.005576414056122303,
"kl": 0.22052001953125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0166,
"num_tokens": 36494531.0,
"reward": 0.3354248106479645,
"reward_std": 0.2429950088262558,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7219324111938477,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.34327030181884766,
"step": 165
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7292634372926343,
"calib/avg_num_step_conf": 1.875,
"calib/ece": 0.1714170040485829,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.2874493927125506,
"calib/gap": 0.289745189117452,
"calib/mean_conf": 0.5408906882591091,
"calib/mu_c": 0.6699270072992701,
"calib/mu_w": 0.3801818181818181,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.078825910931174,
"calib/std_conf": 0.35956566268727164,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.686,
"calib/step_q_c_n": 240.0,
"calib/step_q_gap": 0.21722208333333343,
"calib/step_q_w": 0.4687779166666666,
"calib/step_q_w_n": 240.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 452.44921875,
"completions/mean_terminated_length": 459.6309814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.004611345008015633,
"kl": 0.208587646484375,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0402,
"num_tokens": 36716542.0,
"reward": 0.36253562569618225,
"reward_std": 0.23905181884765625,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7260754108428955,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.29787909984588623,
"step": 166
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6372400756143667,
"calib/avg_num_step_conf": 2.02734375,
"calib/ece": 0.23885375494071148,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.33201581027667987,
"calib/gap": 0.14594202898550745,
"calib/mean_conf": 0.6165612648221345,
"calib/mu_c": 0.6828985507246378,
"calib/mu_w": 0.5369565217391303,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1549802371541502,
"calib/std_conf": 0.33285510800716794,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6601388888888889,
"calib/step_q_c_n": 240.0,
"calib/step_q_gap": 0.10839193548387116,
"calib/step_q_w": 0.5517469534050178,
"calib/step_q_w_n": 279.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1262.0,
"completions/max_terminated_length": 1262.0,
"completions/mean_length": 396.23828125,
"completions/mean_terminated_length": 397.79217529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.005330167710781097,
"kl": 0.23419189453125,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0112,
"num_tokens": 36923587.0,
"reward": 0.3239728808403015,
"reward_std": 0.24499499797821045,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6964414119720459,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.35318315029144287,
"step": 167
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6354657247514389,
"calib/avg_num_step_conf": 2.953125,
"calib/ece": 0.20932270916334655,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2749003984063745,
"calib/gap": 0.16906658817373094,
"calib/mean_conf": 0.5298804780876494,
"calib/mu_c": 0.5999319727891156,
"calib/mu_w": 0.43086538461538465,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.0767729083665338,
"calib/std_conf": 0.33977316322449624,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5348456790123456,
"calib/step_q_c_n": 324.0,
"calib/step_q_gap": 0.1928317901234567,
"calib/step_q_w": 0.3420138888888889,
"calib/step_q_w_n": 432.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2954.0,
"completions/max_terminated_length": 2954.0,
"completions/mean_length": 485.12109375,
"completions/mean_terminated_length": 487.0235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.1792,
"grad_norm": 0.004749679937958717,
"kl": 0.204132080078125,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0393,
"num_tokens": 37152450.0,
"reward": 0.34512436389923096,
"reward_std": 0.25722041726112366,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7028480172157288,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3227555453777313,
"step": 168
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7194544057377049,
"calib/avg_num_step_conf": 2.0859375,
"calib/ece": 0.202956,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.356,
"calib/gap": 0.2582685706967212,
"calib/mean_conf": 0.596004,
"calib/mu_c": 0.7220390624999999,
"calib/mu_w": 0.46377049180327873,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14348,
"calib/std_conf": 0.3596031645911921,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6753700440528633,
"calib/step_q_c_n": 227.0,
"calib/step_q_gap": 0.22620717760335185,
"calib/step_q_w": 0.4491628664495114,
"calib/step_q_w_n": 307.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1589.0,
"completions/max_terminated_length": 1589.0,
"completions/mean_length": 412.7890625,
"completions/mean_terminated_length": 414.4078674316406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.004710485693067312,
"kl": 0.2314453125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0115,
"num_tokens": 37362308.0,
"reward": 0.3566637635231018,
"reward_std": 0.24371802806854248,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7182464599609375,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.29788774251937866,
"step": 169
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7219016393442623,
"calib/avg_num_step_conf": 2.09375,
"calib/ece": 0.17457449392712548,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.25101214574898784,
"calib/gap": 0.2550772327868852,
"calib/mean_conf": 0.5658303643724696,
"calib/mu_c": 0.6949180327868852,
"calib/mu_w": 0.43984080000000003,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.12323886639676111,
"calib/std_conf": 0.34377573486158725,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.6957510729613733,
"calib/step_q_c_n": 233.0,
"calib/step_q_gap": 0.21651014886896414,
"calib/step_q_w": 0.47924092409240915,
"calib/step_q_w_n": 303.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 454.77734375,
"completions/mean_terminated_length": 458.3582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.004924747161567211,
"kl": 0.2138824462890625,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0742,
"num_tokens": 37582883.0,
"reward": 0.3429233431816101,
"reward_std": 0.2477169930934906,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.717066764831543,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.31637632846832275,
"step": 170
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6727064657638755,
"calib/avg_num_step_conf": 2.40234375,
"calib/ece": 0.2091177165354331,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2283464566929134,
"calib/gap": 0.21798596859304487,
"calib/mean_conf": 0.4995437007874015,
"calib/mu_c": 0.6257009345794394,
"calib/mu_w": 0.40771496598639456,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14370078740157483,
"calib/std_conf": 0.34959338543154,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.626068376068376,
"calib/step_q_c_n": 234.0,
"calib/step_q_gap": 0.16297126320748356,
"calib/step_q_w": 0.4630971128608924,
"calib/step_q_w_n": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2297.0,
"completions/max_terminated_length": 2297.0,
"completions/mean_length": 449.28515625,
"completions/mean_terminated_length": 449.28515625,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1824,
"grad_norm": 0.0047374023124575615,
"kl": 0.2057952880859375,
"learning_rate": 8.055555555555557e-07,
"loss": -0.004,
"num_tokens": 37804796.0,
"reward": 0.33203035593032837,
"reward_std": 0.2442462295293808,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7250390648841858,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.34222835302352905,
"step": 171
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6917808219178081,
"calib/avg_num_step_conf": 2.28125,
"calib/ece": 0.17857142857142852,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3611111111111111,
"calib/gap": 0.2352390798655984,
"calib/mean_conf": 0.6316666666666667,
"calib/mu_c": 0.7306164383561644,
"calib/mu_w": 0.495377358490566,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11543650793650789,
"calib/std_conf": 0.34227587963188083,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6494339416058394,
"calib/step_q_c_n": 274.0,
"calib/step_q_gap": 0.1469052319284201,
"calib/step_q_w": 0.5025287096774194,
"calib/step_q_w_n": 310.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2550.0,
"completions/max_terminated_length": 2550.0,
"completions/mean_length": 399.25,
"completions/mean_terminated_length": 400.8157043457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.0047450498677790165,
"kl": 0.21240234375,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0381,
"num_tokens": 38010356.0,
"reward": 0.37122833728790283,
"reward_std": 0.24598166346549988,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7320832014083862,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.2990015149116516,
"step": 172
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6394942157654022,
"calib/avg_num_step_conf": 2.3984375,
"calib/ece": 0.26360655737704924,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.36885245901639346,
"calib/gap": 0.15228679042238358,
"calib/mean_conf": 0.6523770491803279,
"calib/mu_c": 0.7310169491525425,
"calib/mu_w": 0.5787301587301589,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.21618852459016402,
"calib/std_conf": 0.33808174702278915,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7052838427947598,
"calib/step_q_c_n": 229.0,
"calib/step_q_gap": 0.1858682583791753,
"calib/step_q_w": 0.5194155844155844,
"calib/step_q_w_n": 385.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2599.0,
"completions/max_terminated_length": 2599.0,
"completions/mean_length": 445.59765625,
"completions/mean_terminated_length": 449.1062927246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.00468338280916214,
"kl": 0.2205810546875,
"learning_rate": 7.5e-07,
"loss": 0.0231,
"num_tokens": 38227589.0,
"reward": 0.2847577929496765,
"reward_std": 0.29933086037635803,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.647951602935791,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3612484633922577,
"step": 173
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5515151515151515,
"calib/avg_num_step_conf": 2.625,
"calib/ece": 0.3007258064516129,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.24193548387096775,
"calib/gap": 0.051697635697635635,
"calib/mean_conf": 0.53,
"calib/mu_c": 0.5598095238095238,
"calib/mu_w": 0.5081118881118881,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.20366935483870968,
"calib/std_conf": 0.339860028114042,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5228379182156133,
"calib/step_q_c_n": 269.0,
"calib/step_q_gap": 0.00943014319493507,
"calib/step_q_w": 0.5134077750206782,
"calib/step_q_w_n": 403.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2556.0,
"completions/max_terminated_length": 2556.0,
"completions/mean_length": 503.828125,
"completions/mean_terminated_length": 505.803955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.1856,
"grad_norm": 0.0047071753069758415,
"kl": 0.1872100830078125,
"learning_rate": 7.222222222222222e-07,
"loss": 0.004,
"num_tokens": 38460801.0,
"reward": 0.25245046615600586,
"reward_std": 0.23629821836948395,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6287406086921692,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3980584740638733,
"step": 174
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7098421868395096,
"calib/avg_num_step_conf": 3.8125,
"calib/ece": 0.18236734693877554,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.23265306122448978,
"calib/gap": 0.26061716218120345,
"calib/mean_conf": 0.5146938775510204,
"calib/mu_c": 0.6753191489361703,
"calib/mu_w": 0.41470198675496683,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15669387755102043,
"calib/std_conf": 0.3448957372226458,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5815013888888889,
"calib/step_q_c_n": 240.0,
"calib/step_q_gap": 0.21499839975845414,
"calib/step_q_w": 0.3665029891304348,
"calib/step_q_w_n": 736.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 470.05859375,
"completions/mean_terminated_length": 479.4223327636719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.0049662720412015915,
"kl": 0.198516845703125,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0635,
"num_tokens": 38686960.0,
"reward": 0.3290756940841675,
"reward_std": 0.24869877099990845,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.7184125185012817,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.3266673684120178,
"step": 175
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.727698771816419,
"calib/avg_num_step_conf": 2.5625,
"calib/ece": 0.1468674698795181,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.27710843373493976,
"calib/gap": 0.27351519069166136,
"calib/mean_conf": 0.5694377510040161,
"calib/mu_c": 0.7001538461538462,
"calib/mu_w": 0.4266386554621849,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.09710843373493978,
"calib/std_conf": 0.3488830213566231,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6834782608695652,
"calib/step_q_c_n": 230.0,
"calib/step_q_gap": 0.34399469279444783,
"calib/step_q_w": 0.33948356807511737,
"calib/step_q_w_n": 426.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 459.83984375,
"completions/mean_terminated_length": 463.46063232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.0047350418753921986,
"kl": 0.19427490234375,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0135,
"num_tokens": 38908743.0,
"reward": 0.361766517162323,
"reward_std": 0.2339170277118683,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7320382595062256,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.302255243062973,
"step": 176
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6748285677319188,
"calib/avg_num_step_conf": 2.20703125,
"calib/ece": 0.18020080321285142,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.24497991967871485,
"calib/gap": 0.2059600207012548,
"calib/mean_conf": 0.5447791164658635,
"calib/mu_c": 0.6531355932203388,
"calib/mu_w": 0.44717557251908396,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1255421686746988,
"calib/std_conf": 0.34156157371721896,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.66644,
"calib/step_q_c_n": 230.0,
"calib/step_q_gap": 0.22691164179104473,
"calib/step_q_w": 0.4395283582089553,
"calib/step_q_w_n": 335.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2386.0,
"completions/max_terminated_length": 2386.0,
"completions/mean_length": 436.40234375,
"completions/mean_terminated_length": 438.1137390136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.1888,
"grad_norm": 0.005169384181499481,
"kl": 0.2012939453125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0235,
"num_tokens": 39124294.0,
"reward": 0.3260233402252197,
"reward_std": 0.24973660707473755,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7051289081573486,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.33980101346969604,
"step": 177
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.663415853963491,
"calib/avg_num_step_conf": 2.29296875,
"calib/ece": 0.21162055335968386,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.31225296442687744,
"calib/gap": 0.19892285571392843,
"calib/mean_conf": 0.6147826086956523,
"calib/mu_c": 0.7162096774193548,
"calib/mu_w": 0.5172868217054264,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.16814229249011864,
"calib/std_conf": 0.34112875319856484,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6415535714285714,
"calib/step_q_c_n": 280.0,
"calib/step_q_gap": 0.11552197533736619,
"calib/step_q_w": 0.5260315960912052,
"calib/step_q_w_n": 307.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1895.0,
"completions/max_terminated_length": 1895.0,
"completions/mean_length": 396.66015625,
"completions/mean_terminated_length": 399.7834777832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.005466687958687544,
"kl": 0.215423583984375,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0249,
"num_tokens": 39331911.0,
"reward": 0.3221646845340729,
"reward_std": 0.25131478905677795,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6939695477485657,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.340265154838562,
"step": 178
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6278143182393118,
"calib/avg_num_step_conf": 2.921875,
"calib/ece": 0.24039682539682533,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.30158730158730157,
"calib/gap": 0.15215785479382749,
"calib/mean_conf": 0.5884126984126985,
"calib/mu_c": 0.6693220338983051,
"calib/mu_w": 0.5171641791044777,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18027777777777773,
"calib/std_conf": 0.3388892048621255,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6327671232876714,
"calib/step_q_c_n": 219.0,
"calib/step_q_gap": 0.23298886241810618,
"calib/step_q_w": 0.3997782608695652,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 396.3046875,
"completions/mean_terminated_length": 401.00396728515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.005298725329339504,
"kl": 0.234222412109375,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0395,
"num_tokens": 39539629.0,
"reward": 0.29585397243499756,
"reward_std": 0.27797776460647583,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6789250373840332,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.37315452098846436,
"step": 179
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7227230850223745,
"calib/avg_num_step_conf": 1.98828125,
"calib/ece": 0.17453441295546562,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3076923076923077,
"calib/gap": 0.26643327191366145,
"calib/mean_conf": 0.5857894736842105,
"calib/mu_c": 0.7109160305343512,
"calib/mu_w": 0.4444827586206897,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.11497975708502028,
"calib/std_conf": 0.3553445101037254,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.6578779735682819,
"calib/step_q_c_n": 227.0,
"calib/step_q_gap": 0.21603400193707628,
"calib/step_q_w": 0.44184397163120565,
"calib/step_q_w_n": 282.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1876.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 462.7734375,
"completions/mean_terminated_length": 468.2608947753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.192,
"grad_norm": 0.004950664006173611,
"kl": 0.20599365234375,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0023,
"num_tokens": 39761955.0,
"reward": 0.35782623291015625,
"reward_std": 0.26714158058166504,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7177339792251587,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.29505032300949097,
"step": 180
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7027454780361757,
"calib/avg_num_step_conf": 2.36328125,
"calib/ece": 0.15698795180722888,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.26506024096385544,
"calib/gap": 0.23917829457364342,
"calib/mean_conf": 0.554578313253012,
"calib/mu_c": 0.66984496124031,
"calib/mu_w": 0.4306666666666666,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.0967469879518072,
"calib/std_conf": 0.34338487968159287,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6106597014925373,
"calib/step_q_c_n": 268.0,
"calib/step_q_gap": 0.18756375688323956,
"calib/step_q_w": 0.4230959446092978,
"calib/step_q_w_n": 337.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2942.0,
"completions/max_terminated_length": 2942.0,
"completions/mean_length": 426.63671875,
"completions/mean_terminated_length": 429.9960632324219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.005251111462712288,
"kl": 0.2293853759765625,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0224,
"num_tokens": 39977438.0,
"reward": 0.3561040759086609,
"reward_std": 0.26347535848617554,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7260953187942505,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3084184229373932,
"step": 181
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7578456318914333,
"calib/avg_num_step_conf": 1.99609375,
"calib/ece": 0.14294354838709683,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.2782258064516129,
"calib/gap": 0.2923748939779475,
"calib/mean_conf": 0.6222177419354838,
"calib/mu_c": 0.7601526717557253,
"calib/mu_w": 0.4677777777777778,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1184677419354839,
"calib/std_conf": 0.32491115561036443,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7211282608695652,
"calib/step_q_c_n": 230.0,
"calib/step_q_gap": 0.22710833204394237,
"calib/step_q_w": 0.4940199288256228,
"calib/step_q_w_n": 281.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 448.94921875,
"completions/mean_terminated_length": 450.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.005027151666581631,
"kl": 0.2074432373046875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0303,
"num_tokens": 40198529.0,
"reward": 0.369170606136322,
"reward_std": 0.26670369505882263,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.750970721244812,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3079419732093811,
"step": 182
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7034108231707318,
"calib/avg_num_step_conf": 2.23828125,
"calib/ece": 0.19454183266932268,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.29880478087649404,
"calib/gap": 0.2619651930894309,
"calib/mean_conf": 0.5507171314741035,
"calib/mu_c": 0.6843089430894309,
"calib/mu_w": 0.42234375,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12760956175298802,
"calib/std_conf": 0.3620177568025872,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6180842911877394,
"calib/step_q_c_n": 261.0,
"calib/step_q_gap": 0.1559689065723548,
"calib/step_q_w": 0.46211538461538465,
"calib/step_q_w_n": 312.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1938.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 454.4609375,
"completions/mean_terminated_length": 456.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.1952,
"grad_norm": 0.004665024112910032,
"kl": 0.200042724609375,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0071,
"num_tokens": 40421551.0,
"reward": 0.3409632444381714,
"reward_std": 0.260761559009552,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7225355505943298,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.32967162132263184,
"step": 183
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7162499999999999,
"calib/avg_num_step_conf": 2.32421875,
"calib/ece": 0.2200395256916996,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": 0.24769499999999994,
"calib/mean_conf": 0.6069960474308301,
"calib/mu_c": 0.729375,
"calib/mu_w": 0.48168000000000005,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.16055335968379447,
"calib/std_conf": 0.3518978303262607,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6707046070460705,
"calib/step_q_c_n": 246.0,
"calib/step_q_gap": 0.1916146356993656,
"calib/step_q_w": 0.4790899713467049,
"calib/step_q_w_n": 349.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1910.0,
"completions/max_terminated_length": 1910.0,
"completions/mean_length": 423.39453125,
"completions/mean_terminated_length": 425.054931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0054717957973480225,
"kl": 0.2116241455078125,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0021,
"num_tokens": 40635220.0,
"reward": 0.33307600021362305,
"reward_std": 0.28628167510032654,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7161625027656555,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3445417284965515,
"step": 184
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7125305457507468,
"calib/avg_num_step_conf": 2.2578125,
"calib/ece": 0.2018518518518519,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.3292181069958848,
"calib/gap": 0.262407683953299,
"calib/mean_conf": 0.6115637860082305,
"calib/mu_c": 0.7487068965517242,
"calib/mu_w": 0.4862992125984252,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.16802469135802472,
"calib/std_conf": 0.35169785574241746,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6936652835408023,
"calib/step_q_c_n": 241.0,
"calib/step_q_gap": 0.26402196009866574,
"calib/step_q_w": 0.4296433234421365,
"calib/step_q_w_n": 337.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 465.0234375,
"completions/mean_terminated_length": 474.286865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.004754678346216679,
"kl": 0.1771392822265625,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0247,
"num_tokens": 40861186.0,
"reward": 0.3285626769065857,
"reward_std": 0.29156944155693054,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6866241693496704,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": -0.3068425953388214,
"step": 185
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.681418561607615,
"calib/avg_num_step_conf": 2.22265625,
"calib/ece": 0.21615176151761523,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.32113821138211385,
"calib/gap": 0.2182372642340914,
"calib/mean_conf": 0.5773712737127371,
"calib/mu_c": 0.6873770491803279,
"calib/mu_w": 0.46913978494623654,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1487940379403795,
"calib/std_conf": 0.3578489515041175,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7136086956521739,
"calib/step_q_c_n": 230.0,
"calib/step_q_gap": 0.2396263947672181,
"calib/step_q_w": 0.4739823008849558,
"calib/step_q_w_n": 339.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2429.0,
"completions/max_terminated_length": 2429.0,
"completions/mean_length": 421.35546875,
"completions/mean_terminated_length": 429.7490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.1984,
"grad_norm": 0.005402509588748217,
"kl": 0.21337890625,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0002,
"num_tokens": 41074093.0,
"reward": 0.3318336606025696,
"reward_std": 0.24953043460845947,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6887925863265991,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3102814555168152,
"step": 186
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7013105451597649,
"calib/avg_num_step_conf": 3.12109375,
"calib/ece": 0.17158469945355187,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.26639344262295084,
"calib/gap": 0.2301821702807988,
"calib/mean_conf": 0.5464480874316939,
"calib/mu_c": 0.6700294985250737,
"calib/mu_w": 0.43984732824427486,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.1274590163934426,
"calib/std_conf": 0.33904135658692974,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6229629629629629,
"calib/step_q_c_n": 243.0,
"calib/step_q_gap": 0.28879947375432985,
"calib/step_q_w": 0.3341634892086331,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2506.0,
"completions/max_terminated_length": 2506.0,
"completions/mean_length": 509.32421875,
"completions/mean_terminated_length": 517.4087524414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.0040683439001441,
"kl": 0.1923675537109375,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0137,
"num_tokens": 41306024.0,
"reward": 0.33687639236450195,
"reward_std": 0.23229585587978363,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.6975290775299072,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.3003388047218323,
"step": 187
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6816679379608441,
"calib/avg_num_step_conf": 2.8515625,
"calib/ece": 0.2061111111111112,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3134920634920635,
"calib/gap": 0.2120747520976355,
"calib/mean_conf": 0.5808730158730159,
"calib/mu_c": 0.6768115942028986,
"calib/mu_w": 0.4647368421052631,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11968253968253975,
"calib/std_conf": 0.3575712157755738,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6020120401337793,
"calib/step_q_c_n": 299.0,
"calib/step_q_gap": 0.18791149875713586,
"calib/step_q_w": 0.4141005413766434,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 501.99609375,
"completions/mean_terminated_length": 501.99609375,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.0044453563168644905,
"kl": 0.183319091796875,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0118,
"num_tokens": 41538607.0,
"reward": 0.33286768198013306,
"reward_std": 0.27241426706314087,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7093316316604614,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3459400534629822,
"step": 188
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7370574741149482,
"calib/avg_num_step_conf": 1.82421875,
"calib/ece": 0.14645669291338584,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2677165354330709,
"calib/gap": 0.2864566929133858,
"calib/mean_conf": 0.5159842519685041,
"calib/mu_c": 0.6592125984251969,
"calib/mu_w": 0.37275590551181104,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.08122047244094491,
"calib/std_conf": 0.3521458624113264,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6631730769230769,
"calib/step_q_c_n": 208.0,
"calib/step_q_gap": 0.24314489159489167,
"calib/step_q_w": 0.42002818532818526,
"calib/step_q_w_n": 259.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1081.0,
"completions/max_terminated_length": 1081.0,
"completions/mean_length": 396.578125,
"completions/mean_terminated_length": 398.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.2016,
"grad_norm": 0.005412383936345577,
"kl": 0.22686767578125,
"learning_rate": 3.055555555555556e-07,
"loss": 0.015,
"num_tokens": 41747899.0,
"reward": 0.360355019569397,
"reward_std": 0.2340199053287506,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.759178876876831,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3353438377380371,
"step": 189
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7228691197019097,
"calib/avg_num_step_conf": 2.4296875,
"calib/ece": 0.1691463414634146,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.2682926829268293,
"calib/gap": 0.2751380664049505,
"calib/mean_conf": 0.5415853658536586,
"calib/mu_c": 0.6679699248120301,
"calib/mu_w": 0.3928318584070796,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.085040650406504,
"calib/std_conf": 0.35661280104217036,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.621969111969112,
"calib/step_q_c_n": 259.0,
"calib/step_q_gap": 0.20846745907655007,
"calib/step_q_w": 0.4135016528925619,
"calib/step_q_w_n": 363.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1922.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 473.79296875,
"completions/mean_terminated_length": 481.3135070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.004683112259954214,
"kl": 0.18890380859375,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0779,
"num_tokens": 41974798.0,
"reward": 0.36603257060050964,
"reward_std": 0.24707284569740295,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7150722742080688,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.27519458532333374,
"step": 190
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6186642814549792,
"calib/avg_num_step_conf": 2.4375,
"calib/ece": 0.22776422764227644,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.32113821138211385,
"calib/gap": 0.16807195388590723,
"calib/mean_conf": 0.5994715447154471,
"calib/mu_c": 0.6876068376068375,
"calib/mu_w": 0.5195348837209303,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.17581300813008133,
"calib/std_conf": 0.34839254155540555,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6482170542635659,
"calib/step_q_c_n": 258.0,
"calib/step_q_gap": 0.1743445588173,
"calib/step_q_w": 0.4738724954462659,
"calib/step_q_w_n": 366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1973.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 411.7421875,
"completions/mean_terminated_length": 418.2778015136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.005159609951078892,
"kl": 0.217193603515625,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0282,
"num_tokens": 42184372.0,
"reward": 0.29070740938186646,
"reward_std": 0.24974632263183594,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6620468497276306,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3618820309638977,
"step": 191
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7324285339627592,
"calib/avg_num_step_conf": 2.3671875,
"calib/ece": 0.17230769230769227,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.29554655870445345,
"calib/gap": 0.2744623655913981,
"calib/mean_conf": 0.5644534412955466,
"calib/mu_c": 0.7011290322580648,
"calib/mu_w": 0.4266666666666667,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.11736842105263155,
"calib/std_conf": 0.35029369525228626,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6664016736401674,
"calib/step_q_c_n": 239.0,
"calib/step_q_gap": 0.22390467091537175,
"calib/step_q_w": 0.4424970027247957,
"calib/step_q_w_n": 367.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2284.0,
"completions/max_terminated_length": 2284.0,
"completions/mean_length": 467.73828125,
"completions/mean_terminated_length": 471.4212646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.2048,
"grad_norm": 0.005066120997071266,
"kl": 0.1961669921875,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0182,
"num_tokens": 42409089.0,
"reward": 0.35959967970848083,
"reward_std": 0.2445959597826004,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7285863161087036,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.2976681590080261,
"step": 192
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5780069819105046,
"calib/avg_num_step_conf": 2.34375,
"calib/ece": 0.28694444444444445,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.25396825396825395,
"calib/gap": 0.09550682323072035,
"calib/mean_conf": 0.5196428571428572,
"calib/mu_c": 0.5715652173913043,
"calib/mu_w": 0.47605839416058393,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1751190476190476,
"calib/std_conf": 0.3572828892140933,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.614022038567493,
"calib/step_q_c_n": 242.0,
"calib/step_q_gap": 0.16272706650045393,
"calib/step_q_w": 0.4512949720670391,
"calib/step_q_w_n": 358.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 451.8203125,
"completions/mean_terminated_length": 457.1778869628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.0044546108692884445,
"kl": 0.190399169921875,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0066,
"num_tokens": 42630467.0,
"reward": 0.288581907749176,
"reward_std": 0.2478843331336975,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6464488506317139,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.35209745168685913,
"step": 193
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7408296523910121,
"calib/avg_num_step_conf": 2.4921875,
"calib/ece": 0.15487160000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.292,
"calib/gap": 0.298392017156392,
"calib/mean_conf": 0.5328883999999999,
"calib/mu_c": 0.6844715447154471,
"calib/mu_w": 0.3860795275590551,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09788000000000006,
"calib/std_conf": 0.3563361373555032,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5929262295081966,
"calib/step_q_c_n": 244.0,
"calib/step_q_gap": 0.18516353915286665,
"calib/step_q_w": 0.40776269035532997,
"calib/step_q_w_n": 394.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2507.0,
"completions/max_terminated_length": 2507.0,
"completions/mean_length": 442.453125,
"completions/mean_terminated_length": 444.1882629394531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.005256724078208208,
"kl": 0.1939239501953125,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0259,
"num_tokens": 42849679.0,
"reward": 0.38366755843162537,
"reward_std": 0.22829414904117584,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7520594596862793,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.2761306166648865,
"step": 194
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6435240762971857,
"calib/avg_num_step_conf": 2.9765625,
"calib/ece": 0.2221024489795918,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.2653061224489796,
"calib/gap": 0.17944061624649865,
"calib/mean_conf": 0.5359383673469387,
"calib/mu_c": 0.6230952380952381,
"calib/mu_w": 0.4436546218487395,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12187755102040815,
"calib/std_conf": 0.35479391131689186,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6177318295739348,
"calib/step_q_c_n": 266.0,
"calib/step_q_gap": 0.23394150699328958,
"calib/step_q_w": 0.3837903225806452,
"calib/step_q_w_n": 496.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 453.1875,
"completions/mean_terminated_length": 460.3809814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.208,
"grad_norm": 0.0048271650448441505,
"kl": 0.201446533203125,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0325,
"num_tokens": 43071679.0,
"reward": 0.3020411729812622,
"reward_std": 0.25643521547317505,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6751362681388855,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.3601164221763611,
"step": 195
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6687609402350587,
"calib/avg_num_step_conf": 1.9296875,
"calib/ece": 0.2553359683794467,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.43478260869565216,
"calib/gap": 0.19564766191547878,
"calib/mean_conf": 0.6760474308300395,
"calib/mu_c": 0.771937984496124,
"calib/mu_w": 0.5762903225806453,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.21075098814229257,
"calib/std_conf": 0.34119556961169406,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6745349794238684,
"calib/step_q_c_n": 243.0,
"calib/step_q_gap": 0.10173298739199588,
"calib/step_q_w": 0.5728019920318725,
"calib/step_q_w_n": 251.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1446.0,
"completions/max_terminated_length": 1446.0,
"completions/mean_length": 353.94921875,
"completions/mean_terminated_length": 355.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.005029777064919472,
"kl": 0.22454833984375,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0013,
"num_tokens": 43264834.0,
"reward": 0.31094813346862793,
"reward_std": 0.2894595265388489,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6839672327041626,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3573834300041199,
"step": 196
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7118863049095607,
"calib/avg_num_step_conf": 2.53125,
"calib/ece": 0.16815261044176702,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.28112449799196787,
"calib/gap": 0.2436937984496123,
"calib/mean_conf": 0.5722489959839357,
"calib/mu_c": 0.6984999999999999,
"calib/mu_w": 0.4548062015503876,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.12923694779116462,
"calib/std_conf": 0.3316754386980413,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.664102564102564,
"calib/step_q_c_n": 234.0,
"calib/step_q_gap": 0.2018803418803417,
"calib/step_q_w": 0.46222222222222226,
"calib/step_q_w_n": 414.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2145.0,
"completions/max_terminated_length": 2145.0,
"completions/mean_length": 451.1953125,
"completions/mean_terminated_length": 452.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.004593154415488243,
"kl": 0.219757080078125,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0146,
"num_tokens": 43485396.0,
"reward": 0.3464347720146179,
"reward_std": 0.23981933295726776,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7292242050170898,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.3230733871459961,
"step": 197
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7505573770491805,
"calib/avg_num_step_conf": 2.3203125,
"calib/ece": 0.16368421052631582,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3157894736842105,
"calib/gap": 0.2978478688524591,
"calib/mean_conf": 0.5904048582995953,
"calib/mu_c": 0.7375200000000001,
"calib/mu_w": 0.43967213114754095,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12400809716599193,
"calib/std_conf": 0.3462310636700311,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7191450131233597,
"calib/step_q_c_n": 254.0,
"calib/step_q_gap": 0.21238030724100676,
"calib/step_q_w": 0.5067647058823529,
"calib/step_q_w_n": 340.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1913.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 403.33984375,
"completions/mean_terminated_length": 406.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.2112,
"grad_norm": 0.005886967293918133,
"kl": 0.215576171875,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0155,
"num_tokens": 43694035.0,
"reward": 0.365684449672699,
"reward_std": 0.22702525556087494,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7409449219703674,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.2986384928226471,
"step": 198
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7204724409448819,
"calib/avg_num_step_conf": 2.27734375,
"calib/ece": 0.20097165991902832,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3481781376518219,
"calib/gap": 0.24710695538057742,
"calib/mean_conf": 0.6190283400809716,
"calib/mu_c": 0.7460833333333333,
"calib/mu_w": 0.4989763779527559,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.167085020242915,
"calib/std_conf": 0.3413296624498007,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6792417061611375,
"calib/step_q_c_n": 211.0,
"calib/step_q_gap": 0.17360755981967413,
"calib/step_q_w": 0.5056341463414634,
"calib/step_q_w_n": 369.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1937.0,
"completions/max_terminated_length": 1937.0,
"completions/mean_length": 442.73046875,
"completions/mean_terminated_length": 451.5498046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.004877576604485512,
"kl": 0.2230224609375,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0493,
"num_tokens": 43911574.0,
"reward": 0.33585840463638306,
"reward_std": 0.28109073638916016,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7057578563690186,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.31919723749160767,
"step": 199
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7489893799246318,
"calib/avg_num_step_conf": 2.5078125,
"calib/ece": 0.16906557377049175,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.36065573770491804,
"calib/gap": 0.30168194587187397,
"calib/mean_conf": 0.6129836065573769,
"calib/mu_c": 0.7428057553956835,
"calib/mu_w": 0.44112380952380953,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.10618852459016385,
"calib/std_conf": 0.35251515450755383,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7124039653035935,
"calib/step_q_c_n": 269.0,
"calib/step_q_gap": 0.2959428392982317,
"calib/step_q_w": 0.41646112600536184,
"calib/step_q_w_n": 373.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1725.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 436.5703125,
"completions/mean_terminated_length": 445.2669372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.004993748385459185,
"kl": 0.1981201171875,
"learning_rate": 0.0,
"loss": -0.0456,
"num_tokens": 44131384.0,
"reward": 0.3842260241508484,
"reward_std": 0.22596809267997742,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7367645502090454,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.26753127574920654,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.008306001230375842,
"train_runtime": 11013.984,
"train_samples_per_second": 4.649,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 44131384,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}