Files
PureRL-1.5B-v7-s2-l2-maskoff/trainer_state.json
ModelHub XC 93878d06aa 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-maskoff
Source: Original Platform
2026-06-04 16:45:21 +08:00

9843 lines
384 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.006959581281989813,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0031,
"num_tokens": 229171.0,
"reward": 0.40509337186813354,
"reward_std": 0.17624244093894958,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.23456206917762756,
"step": 1
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.0071344017051160336,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0015,
"num_tokens": 458661.0,
"reward": 0.3357120156288147,
"reward_std": 0.18962696194648743,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.2669021487236023,
"step": 2
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4613862050456253,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.25074803149606306,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.37401574803149606,
"calib/gap": -0.003988191089640103,
"calib/mean_conf": 0.8885433070866141,
"calib/mu_c": 0.8870987654320989,
"calib/mu_w": 0.891086956521739,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25074803149606306,
"calib/std_conf": 0.04568641072021581,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8050111576011157,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.05370680977502895,
"calib/step_q_w": 0.7513043478260868,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2369.0,
"completions/max_terminated_length": 2369.0,
"completions/mean_length": 497.15234375,
"completions/mean_terminated_length": 497.15234375,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.0032,
"grad_norm": 0.0076317982748150826,
"kl": 0.001775592565536499,
"learning_rate": 7.5e-07,
"loss": 0.0464,
"num_tokens": 691188.0,
"reward": 0.3898078203201294,
"reward_std": 0.17493298649787903,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.694400429725647,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.23822227120399475,
"step": 3
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.44732205778717404,
"calib/avg_num_step_conf": 4.9375,
"calib/ece": 0.22003984063745022,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.24302788844621515,
"calib/gap": -0.004470049330514536,
"calib/mean_conf": 0.8774103585657371,
"calib/mu_c": 0.8758787878787879,
"calib/mu_w": 0.8803488372093025,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22003984063745022,
"calib/std_conf": 0.047352906365984486,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7971202916160388,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.008820971888147677,
"calib/step_q_w": 0.7882993197278911,
"calib/step_q_w_n": 441.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2366.0,
"completions/max_terminated_length": 2366.0,
"completions/mean_length": 522.60546875,
"completions/mean_terminated_length": 522.60546875,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.007802617270499468,
"kl": 0.0003025531768798828,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0305,
"num_tokens": 931143.0,
"reward": 0.38953256607055664,
"reward_std": 0.18929244577884674,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7030750513076782,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.24744734168052673,
"step": 4
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.47041553748870824,
"calib/avg_num_step_conf": 4.9375,
"calib/ece": 0.3760240963855421,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.30120481927710846,
"calib/gap": -0.005913666279519836,
"calib/mean_conf": 0.8820481927710844,
"calib/mu_c": 0.8791269841269842,
"calib/mu_w": 0.885040650406504,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3760240963855421,
"calib/std_conf": 0.043853325339365874,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7959670164917543,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.010941890863613635,
"calib/step_q_w": 0.7850251256281406,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2906.0,
"completions/max_terminated_length": 2906.0,
"completions/mean_length": 543.76953125,
"completions/mean_terminated_length": 545.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.006589222699403763,
"kl": 0.00029391050338745117,
"learning_rate": 1.25e-06,
"loss": 0.0478,
"num_tokens": 1177036.0,
"reward": 0.29454368352890015,
"reward_std": 0.16011402010917664,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5850351452827454,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.287354052066803,
"step": 5
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5200569440050617,
"calib/avg_num_step_conf": 5.375,
"calib/ece": 0.30322834645669294,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2677165354330709,
"calib/gap": 0.0029939892439100335,
"calib/mean_conf": 0.8740944881889764,
"calib/mu_c": 0.8753793103448275,
"calib/mu_w": 0.8723853211009175,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30322834645669294,
"calib/std_conf": 0.046063194023381486,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7994713656387666,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.01912604189775935,
"calib/step_q_w": 0.7803453237410073,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3036.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 450.3828125,
"completions/mean_terminated_length": 450.3828125,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.0064,
"grad_norm": 0.009682250209152699,
"kl": 0.0006236135959625244,
"learning_rate": 1.5e-06,
"loss": 0.015,
"num_tokens": 1398286.0,
"reward": 0.3437407612800598,
"reward_std": 0.16819913685321808,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6572445034980774,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.28148171305656433,
"step": 6
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4516723356009071,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.220952380952381,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.32936507936507936,
"calib/gap": -0.010833333333333584,
"calib/mean_conf": 0.8801587301587303,
"calib/mu_c": 0.876547619047619,
"calib/mu_w": 0.8873809523809526,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21722222222222226,
"calib/std_conf": 0.0510210682748932,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7923841807909604,
"calib/step_q_c_n": 885.0,
"calib/step_q_gap": 0.010961335099577751,
"calib/step_q_w": 0.7814228456913827,
"calib/step_q_w_n": 499.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2403.0,
"completions/max_terminated_length": 2403.0,
"completions/mean_length": 533.61328125,
"completions/mean_terminated_length": 539.9407348632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.007666449528187513,
"kl": 0.000966191291809082,
"learning_rate": 1.75e-06,
"loss": 0.0273,
"num_tokens": 1642315.0,
"reward": 0.39989861845970154,
"reward_std": 0.17968562245368958,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7019117474555969,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.22789573669433594,
"step": 7
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.45037819509650495,
"calib/avg_num_step_conf": 4.8828125,
"calib/ece": 0.3014000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.296,
"calib/gap": 0.010517736045905002,
"calib/mean_conf": 0.8694000000000001,
"calib/mu_c": 0.873943661971831,
"calib/mu_w": 0.863425925925926,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3014000000000001,
"calib/std_conf": 0.09368479065462014,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8057534246575342,
"calib/step_q_c_n": 657.0,
"calib/step_q_gap": 0.0400535932915983,
"calib/step_q_w": 0.7656998313659359,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 536.734375,
"completions/mean_terminated_length": 540.9606323242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.007007604464888573,
"kl": 0.0007126033306121826,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0149,
"num_tokens": 1886231.0,
"reward": 0.34858155250549316,
"reward_std": 0.17785796523094177,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6407878398895264,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.24909357726573944,
"step": 8
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5257313472893184,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.2355905511811025,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.23622047244094488,
"calib/gap": 0.007353730542136394,
"calib/mean_conf": 0.8733858267716536,
"calib/mu_c": 0.8760493827160495,
"calib/mu_w": 0.8686956521739131,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2355905511811025,
"calib/std_conf": 0.048011161308004424,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7510578105781059,
"calib/step_q_c_n": 813.0,
"calib/step_q_gap": -0.04105355833140689,
"calib/step_q_w": 0.7921113689095128,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2985.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 511.203125,
"completions/mean_terminated_length": 513.2078857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0096,
"grad_norm": 0.006651603616774082,
"kl": 0.0003667175769805908,
"learning_rate": 2.25e-06,
"loss": 0.0019,
"num_tokens": 2124635.0,
"reward": 0.37856239080429077,
"reward_std": 0.18048423528671265,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7026296854019165,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.2673799693584442,
"step": 9
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.44472446236559143,
"calib/avg_num_step_conf": 5.04296875,
"calib/ece": 0.24976284584980227,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2885375494071146,
"calib/gap": -0.007983870967742002,
"calib/mean_conf": 0.8804347826086957,
"calib/mu_c": 0.8774999999999998,
"calib/mu_w": 0.8854838709677418,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24889328063241095,
"calib/std_conf": 0.04371359236849638,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7954461942257218,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.005143736758803086,
"calib/step_q_w": 0.7903024574669187,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2637.0,
"completions/max_terminated_length": 2637.0,
"completions/mean_length": 514.5625,
"completions/mean_terminated_length": 514.5625,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.007601493038237095,
"kl": 0.0004711151123046875,
"learning_rate": 2.5e-06,
"loss": 0.0195,
"num_tokens": 2363163.0,
"reward": 0.3792797327041626,
"reward_std": 0.18951097130775452,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6921863555908203,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.25628310441970825,
"step": 10
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.45581132785548517,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.2950390624999998,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.21875,
"calib/gap": -0.010393903280436545,
"calib/mean_conf": 0.8705859375,
"calib/mu_c": 0.866241610738255,
"calib/mu_w": 0.8766355140186916,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2917968749999998,
"calib/std_conf": 0.05131140031460937,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7901703800786369,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": 0.003828916664002757,
"calib/step_q_w": 0.7863414634146342,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1398.0,
"completions/max_terminated_length": 1398.0,
"completions/mean_length": 504.53125,
"completions/mean_terminated_length": 506.50982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.006837381515651941,
"kl": 0.0004589557647705078,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0013,
"num_tokens": 2596803.0,
"reward": 0.3525460362434387,
"reward_std": 0.14610227942466736,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6657754182815552,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.27708953619003296,
"step": 11
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.45588134076674236,
"calib/avg_num_step_conf": 5.42578125,
"calib/ece": 0.22531496062992135,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2795275590551181,
"calib/gap": -0.00828205657650205,
"calib/mean_conf": 0.8769685039370079,
"calib/mu_c": 0.8741317365269462,
"calib/mu_w": 0.8824137931034483,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2224015748031497,
"calib/std_conf": 0.04995138376743412,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7984235294117646,
"calib/step_q_c_n": 850.0,
"calib/step_q_gap": 0.022968983957219136,
"calib/step_q_w": 0.7754545454545455,
"calib/step_q_w_n": 539.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2436.0,
"completions/max_terminated_length": 2436.0,
"completions/mean_length": 484.28515625,
"completions/mean_terminated_length": 484.28515625,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.0128,
"grad_norm": 0.007849551737308502,
"kl": 0.0010522007942199707,
"learning_rate": 3e-06,
"loss": 0.0313,
"num_tokens": 2824956.0,
"reward": 0.4141455888748169,
"reward_std": 0.17251336574554443,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7109042406082153,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.21073806285858154,
"step": 12
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5100972070098576,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.23007874015748023,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2992125984251969,
"calib/gap": 0.00084200438116111,
"calib/mean_conf": 0.8795275590551181,
"calib/mu_c": 0.8798192771084338,
"calib/mu_w": 0.8789772727272727,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22803149606299206,
"calib/std_conf": 0.04864070858691373,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7896358907672302,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": -0.0006418870105476815,
"calib/step_q_w": 0.7902777777777779,
"calib/step_q_w_n": 468.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2676.0,
"completions/max_terminated_length": 2676.0,
"completions/mean_length": 504.5703125,
"completions/mean_terminated_length": 504.5703125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.00747161079198122,
"kl": 0.0017485618591308594,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0506,
"num_tokens": 3058718.0,
"reward": 0.40975651144981384,
"reward_std": 0.1711602509021759,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7134867310523987,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.22131745517253876,
"step": 13
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4449854266030736,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.32983805668016203,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3076923076923077,
"calib/gap": -0.00949920508744051,
"calib/mean_conf": 0.8804453441295548,
"calib/mu_c": 0.8761764705882352,
"calib/mu_w": 0.8856756756756757,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.32983805668016203,
"calib/std_conf": 0.046114231169776115,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.804,
"calib/step_q_c_n": 655.0,
"calib/step_q_gap": 0.015983471074380362,
"calib/step_q_w": 0.7880165289256197,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2528.0,
"completions/max_terminated_length": 2528.0,
"completions/mean_length": 571.23828125,
"completions/mean_terminated_length": 573.4784545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.0067930989898741245,
"kl": 0.002413034439086914,
"learning_rate": 3.5e-06,
"loss": 0.0179,
"num_tokens": 3310355.0,
"reward": 0.33452653884887695,
"reward_std": 0.19036152958869934,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6139481067657471,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.24333252012729645,
"step": 14
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5114159512255817,
"calib/avg_num_step_conf": 4.92578125,
"calib/ece": 0.33219607843137255,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.37254901960784315,
"calib/gap": 0.0015173572228442955,
"calib/mean_conf": 0.8851372549019607,
"calib/mu_c": 0.8858156028368794,
"calib/mu_w": 0.8842982456140351,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33219607843137255,
"calib/std_conf": 0.044772636779947896,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7979115853658537,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.004985965531143055,
"calib/step_q_w": 0.7929256198347107,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1052.0,
"completions/max_terminated_length": 1052.0,
"completions/mean_length": 488.13671875,
"completions/mean_terminated_length": 490.0509948730469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.016,
"grad_norm": 0.006855354178696871,
"kl": 0.005002260208129883,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0159,
"num_tokens": 3543198.0,
"reward": 0.3387652635574341,
"reward_std": 0.17916326224803925,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6386894583702087,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.27053388953208923,
"step": 15
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.44479110146500267,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.28172690763052205,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.41365461847389556,
"calib/gap": 0.002976804123711152,
"calib/mean_conf": 0.8875903614457831,
"calib/mu_c": 0.8887499999999998,
"calib/mu_w": 0.8857731958762887,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.279437751004016,
"calib/std_conf": 0.07968699351343808,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.768563656147987,
"calib/step_q_c_n": 919.0,
"calib/step_q_gap": -0.016130933729848862,
"calib/step_q_w": 0.7846945898778359,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2768.0,
"completions/max_terminated_length": 2768.0,
"completions/mean_length": 679.59765625,
"completions/mean_terminated_length": 682.2627563476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.006001787725836039,
"kl": 0.005553245544433594,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0197,
"num_tokens": 3826023.0,
"reward": 0.358783483505249,
"reward_std": 0.1683286428451538,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.654166042804718,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.2490990310907364,
"step": 16
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4394421335943235,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.13988,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.244,
"calib/gap": -0.005791534132615572,
"calib/mean_conf": 0.8718800000000001,
"calib/mu_c": 0.870327868852459,
"calib/mu_w": 0.8761194029850746,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.13988,
"calib/std_conf": 0.05323969947323144,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7790087241003272,
"calib/step_q_c_n": 917.0,
"calib/step_q_gap": 0.000997674376570279,
"calib/step_q_w": 0.7780110497237569,
"calib/step_q_w_n": 362.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2914.0,
"completions/max_terminated_length": 2914.0,
"completions/mean_length": 546.64453125,
"completions/mean_terminated_length": 550.9487915039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.0068136779591441154,
"kl": 0.00926065444946289,
"learning_rate": 4.25e-06,
"loss": 0.0026,
"num_tokens": 4069492.0,
"reward": 0.45704808831214905,
"reward_std": 0.19913692772388458,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7529730796813965,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": -0.17325183749198914,
"step": 17
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.464132475194422,
"calib/avg_num_step_conf": 4.50390625,
"calib/ece": 0.34877551020408165,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.3510204081632653,
"calib/gap": -0.008856261732367976,
"calib/mean_conf": 0.8793877551020409,
"calib/mu_c": 0.8753030303030302,
"calib/mu_w": 0.8841592920353982,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.34469387755102043,
"calib/std_conf": 0.07845467073582399,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.7885454545454545,
"calib/step_q_c_n": 550.0,
"calib/step_q_gap": 0.07280747776270158,
"calib/step_q_w": 0.715737976782753,
"calib/step_q_w_n": 603.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 570.30078125,
"completions/mean_terminated_length": 574.7913208007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.0192,
"grad_norm": 0.006491835694760084,
"kl": 0.013482093811035156,
"learning_rate": 4.5e-06,
"loss": -0.0022,
"num_tokens": 4326209.0,
"reward": 0.2919595241546631,
"reward_std": 0.14282935857772827,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5701101422309875,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": -0.27290987968444824,
"step": 18
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5346534653465347,
"calib/avg_num_step_conf": 4.4453125,
"calib/ece": 0.28625498007968125,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.38247011952191234,
"calib/gap": 0.01972079207920774,
"calib/mean_conf": 0.8838645418326693,
"calib/mu_c": 0.8917999999999999,
"calib/mu_w": 0.8720792079207922,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.28625498007968125,
"calib/std_conf": 0.0766985976284869,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.789051987767584,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.056799921651881524,
"calib/step_q_w": 0.7322520661157025,
"calib/step_q_w_n": 484.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1491.0,
"completions/max_terminated_length": 1491.0,
"completions/mean_length": 542.04296875,
"completions/mean_terminated_length": 548.4703979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.007006355561316013,
"kl": 0.020032882690429688,
"learning_rate": 4.75e-06,
"loss": 0.0123,
"num_tokens": 4569732.0,
"reward": 0.3702203035354614,
"reward_std": 0.1917993575334549,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6583890914916992,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.22810472548007965,
"step": 19
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49956510103037605,
"calib/avg_num_step_conf": 4.80859375,
"calib/ece": 0.31429149797570843,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3522267206477733,
"calib/gap": -0.00024755787501651305,
"calib/mean_conf": 0.8851417004048583,
"calib/mu_c": 0.8850354609929078,
"calib/mu_w": 0.8852830188679243,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31429149797570843,
"calib/std_conf": 0.048096143611593256,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.794763948497854,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.0019443996256736096,
"calib/step_q_w": 0.7928195488721804,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 565.83203125,
"completions/mean_terminated_length": 565.83203125,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.01030268706381321,
"kl": 0.0440673828125,
"learning_rate": 5e-06,
"loss": 0.0425,
"num_tokens": 4819457.0,
"reward": 0.3507111668586731,
"reward_std": 0.17901304364204407,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6296054720878601,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.2305269092321396,
"step": 20
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5494007704380083,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.23751004016064262,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3534136546184739,
"calib/gap": 0.011117848480524994,
"calib/mean_conf": 0.8853012048192771,
"calib/mu_c": 0.8891411042944786,
"calib/mu_w": 0.8780232558139536,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23409638554216872,
"calib/std_conf": 0.054728332700301,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7738012422360248,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.009273345240316644,
"calib/step_q_w": 0.7645278969957081,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2869.0,
"completions/max_terminated_length": 2869.0,
"completions/mean_length": 582.05078125,
"completions/mean_terminated_length": 582.05078125,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.0224,
"grad_norm": 0.006925127934664488,
"kl": 0.023862838745117188,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0254,
"num_tokens": 5071422.0,
"reward": 0.41713836789131165,
"reward_std": 0.21328508853912354,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7029625177383423,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.19056078791618347,
"step": 21
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5403197031328053,
"calib/avg_num_step_conf": 4.39453125,
"calib/ece": 0.1998031496062992,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3346456692913386,
"calib/gap": 0.009304931135374561,
"calib/mean_conf": 0.8809055118110235,
"calib/mu_c": 0.8838728323699423,
"calib/mu_w": 0.8745679012345677,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1998031496062992,
"calib/std_conf": 0.04911386947028659,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7907911802853437,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.0250002198333662,
"calib/step_q_w": 0.7657909604519775,
"calib/step_q_w_n": 354.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2940.0,
"completions/max_terminated_length": 2940.0,
"completions/mean_length": 542.12109375,
"completions/mean_terminated_length": 542.12109375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.007413065526634455,
"kl": 0.030803680419921875,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0027,
"num_tokens": 5312021.0,
"reward": 0.4359588921070099,
"reward_std": 0.12823528051376343,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7348839640617371,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.19577866792678833,
"step": 22
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5566195115441078,
"calib/avg_num_step_conf": 4.09765625,
"calib/ece": 0.3977551020408163,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.24897959183673468,
"calib/gap": 0.005119444815160912,
"calib/mean_conf": 0.8751428571428572,
"calib/mu_c": 0.8777966101694916,
"calib/mu_w": 0.8726771653543307,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.3956326530612245,
"calib/std_conf": 0.05291194070706304,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.7744303797468355,
"calib/step_q_c_n": 474.0,
"calib/step_q_gap": -0.004352228948816594,
"calib/step_q_w": 0.7787826086956521,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2999.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 583.55078125,
"completions/mean_terminated_length": 583.55078125,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.007403541821986437,
"kl": 0.03454780578613281,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.004,
"num_tokens": 5565346.0,
"reward": 0.2809341549873352,
"reward_std": 0.24287301301956177,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5553504228591919,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l2_reward": -0.26848214864730835,
"step": 23
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.516395618984401,
"calib/avg_num_step_conf": 3.78515625,
"calib/ece": 0.3439024390243901,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.2967479674796748,
"calib/gap": 0.001826750746763972,
"calib/mean_conf": 0.8760162601626016,
"calib/mu_c": 0.8768702290076337,
"calib/mu_w": 0.8750434782608697,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.3436991869918698,
"calib/std_conf": 0.05450053833400678,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.7803929273084479,
"calib/step_q_c_n": 509.0,
"calib/step_q_gap": 0.008936405569317407,
"calib/step_q_w": 0.7714565217391305,
"calib/step_q_w_n": 460.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2609.0,
"completions/max_terminated_length": 2609.0,
"completions/mean_length": 577.32421875,
"completions/mean_terminated_length": 581.8700561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.0256,
"grad_norm": 0.006647349335253239,
"kl": 0.030107498168945312,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0012,
"num_tokens": 5817653.0,
"reward": 0.32674214243888855,
"reward_std": 0.22721973061561584,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5986887216567993,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.23817317187786102,
"step": 24
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5208709612683123,
"calib/avg_num_step_conf": 3.76953125,
"calib/ece": 0.26552,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.28,
"calib/gap": 0.01258411933908632,
"calib/mean_conf": 0.86952,
"calib/mu_c": 0.8745033112582782,
"calib/mu_w": 0.8619191919191919,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.26552,
"calib/std_conf": 0.06923416497654897,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.7856052141527002,
"calib/step_q_c_n": 537.0,
"calib/step_q_gap": 0.02074540106858802,
"calib/step_q_w": 0.7648598130841122,
"calib/step_q_w_n": 428.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 522.05859375,
"completions/mean_terminated_length": 524.1058959960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.006916355807334185,
"kl": 0.03505706787109375,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0308,
"num_tokens": 6054524.0,
"reward": 0.3672490119934082,
"reward_std": 0.1614658534526825,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6542297005653381,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": -0.22754418849945068,
"step": 25
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.4935005298481102,
"calib/avg_num_step_conf": 3.5234375,
"calib/ece": 0.26204918032786867,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.2827868852459016,
"calib/gap": 0.004257152949487919,
"calib/mean_conf": 0.8727049180327868,
"calib/mu_c": 0.8743624161073825,
"calib/mu_w": 0.8701052631578946,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.26204918032786867,
"calib/std_conf": 0.054765324754675604,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8019260700389106,
"calib/step_q_c_n": 514.0,
"calib/step_q_gap": 0.018008544265714743,
"calib/step_q_w": 0.7839175257731958,
"calib/step_q_w_n": 388.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2383.0,
"completions/max_terminated_length": 2383.0,
"completions/mean_length": 551.98046875,
"completions/mean_terminated_length": 554.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.006535641383379698,
"kl": 0.028377532958984375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0319,
"num_tokens": 6301071.0,
"reward": 0.3780941367149353,
"reward_std": 0.1892511248588562,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6601343154907227,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.21097734570503235,
"step": 26
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5303893637226972,
"calib/avg_num_step_conf": 3.546875,
"calib/ece": 0.3560082304526748,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.1934156378600823,
"calib/gap": -0.00423076923076926,
"calib/mean_conf": 0.8625925925925927,
"calib/mu_c": 0.8605555555555555,
"calib/mu_w": 0.8647863247863248,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.3500411522633744,
"calib/std_conf": 0.0613183914524468,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.7823225806451612,
"calib/step_q_c_n": 465.0,
"calib/step_q_gap": 0.00038127139008226063,
"calib/step_q_w": 0.781941309255079,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2549.0,
"completions/max_terminated_length": 2549.0,
"completions/mean_length": 524.75390625,
"completions/mean_terminated_length": 535.2072143554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.0288,
"grad_norm": 0.0070977299474179745,
"kl": 0.036457061767578125,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0038,
"num_tokens": 6540624.0,
"reward": 0.30288949608802795,
"reward_std": 0.21201197803020477,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5817859172821045,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": -0.26116320490837097,
"step": 27
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.6194423223834988,
"calib/avg_num_step_conf": 3.0703125,
"calib/ece": 0.22330543933054398,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.25523012552301255,
"calib/gap": 0.0263071046600456,
"calib/mean_conf": 0.8676569037656906,
"calib/mu_c": 0.877012987012987,
"calib/mu_w": 0.8507058823529414,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.22330543933054398,
"calib/std_conf": 0.05225011725913397,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_c": 0.7906796116504855,
"calib/step_q_c_n": 515.0,
"calib/step_q_gap": 0.013447139325762247,
"calib/step_q_w": 0.7772324723247233,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 588.74609375,
"completions/mean_terminated_length": 595.727294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.006945778150111437,
"kl": 0.026153564453125,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0547,
"num_tokens": 6798287.0,
"reward": 0.3795027434825897,
"reward_std": 0.18767467141151428,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.643653929233551,
"rewards/format_reward_step": 0.88671875,
"rewards/step_l2_reward": -0.18230466544628143,
"step": 28
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5269381946326918,
"calib/avg_num_step_conf": 3.046875,
"calib/ece": 0.38341563786008237,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": 0.004739766874491624,
"calib/mean_conf": 0.8731275720164609,
"calib/mu_c": 0.8755462184873948,
"calib/mu_w": 0.8708064516129032,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.38341563786008237,
"calib/std_conf": 0.04838218243560813,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.792140921409214,
"calib/step_q_c_n": 369.0,
"calib/step_q_gap": 0.015279607540600981,
"calib/step_q_w": 0.776861313868613,
"calib/step_q_w_n": 411.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 616.796875,
"completions/mean_terminated_length": 616.796875,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.005464597605168819,
"kl": 0.026638031005859375,
"learning_rate": 4.75e-06,
"loss": 0.0254,
"num_tokens": 7063315.0,
"reward": 0.30837979912757874,
"reward_std": 0.17728251218795776,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.5679382681846619,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": -0.232428640127182,
"step": 29
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5185072610604524,
"calib/avg_num_step_conf": 3.21875,
"calib/ece": 0.2950000000000002,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.21951219512195122,
"calib/gap": 0.01468085106382977,
"calib/mean_conf": 0.8650813008130082,
"calib/mu_c": 0.8713475177304966,
"calib/mu_w": 0.8566666666666668,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.2934552845528457,
"calib/std_conf": 0.07664054982829054,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.7726666666666667,
"calib/step_q_c_n": 450.0,
"calib/step_q_gap": 0.024324420677361958,
"calib/step_q_w": 0.7483422459893048,
"calib/step_q_w_n": 374.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 609.2734375,
"completions/mean_terminated_length": 609.2734375,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.032,
"grad_norm": 0.0061422535218298435,
"kl": 0.033168792724609375,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0106,
"num_tokens": 7326273.0,
"reward": 0.33184754848480225,
"reward_std": 0.27306443452835083,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.614844560623169,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": -0.24568068981170654,
"step": 30
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6292620865139948,
"calib/avg_num_step_conf": 2.9765625,
"calib/ece": 0.3940239043824701,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": 0.029038167938931436,
"calib/mean_conf": 0.8688446215139443,
"calib/mu_c": 0.8840000000000001,
"calib/mu_w": 0.8549618320610687,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3923904382470119,
"calib/std_conf": 0.08539024409036974,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8020317585301837,
"calib/step_q_c_n": 381.0,
"calib/step_q_gap": 0.028094750656167955,
"calib/step_q_w": 0.7739370078740158,
"calib/step_q_w_n": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1760.0,
"completions/max_terminated_length": 1760.0,
"completions/mean_length": 581.9375,
"completions/mean_terminated_length": 584.2196655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.0060067446902394295,
"kl": 0.02797698974609375,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0063,
"num_tokens": 7581161.0,
"reward": 0.29306304454803467,
"reward_std": 0.19945606589317322,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5801421403884888,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.27995362877845764,
"step": 31
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6632510013351135,
"calib/avg_num_step_conf": 3.20703125,
"calib/ece": 0.30012145748987856,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.2591093117408907,
"calib/gap": 0.036122830440587395,
"calib/mean_conf": 0.8669230769230769,
"calib/mu_c": 0.8825714285714286,
"calib/mu_w": 0.8464485981308412,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.30012145748987856,
"calib/std_conf": 0.07387651197851379,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8015618221258134,
"calib/step_q_c_n": 461.0,
"calib/step_q_gap": 0.07678404434803554,
"calib/step_q_w": 0.7247777777777779,
"calib/step_q_w_n": 360.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 579.53515625,
"completions/mean_terminated_length": 581.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.0065063112415373325,
"kl": 0.030467987060546875,
"learning_rate": 4.666666666666667e-06,
"loss": 0.004,
"num_tokens": 7836226.0,
"reward": 0.36128664016723633,
"reward_std": 0.20347735285758972,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6428042650222778,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.22023098170757294,
"step": 32
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5393358876117497,
"calib/avg_num_step_conf": 2.97265625,
"calib/ece": 0.33298804780876473,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.24701195219123506,
"calib/gap": 0.008127075351213486,
"calib/mean_conf": 0.8708366533864541,
"calib/mu_c": 0.8745925925925926,
"calib/mu_w": 0.8664655172413791,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.33298804780876473,
"calib/std_conf": 0.04760914500809453,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7982195121951222,
"calib/step_q_c_n": 410.0,
"calib/step_q_gap": 0.015797859773469813,
"calib/step_q_w": 0.7824216524216524,
"calib/step_q_w_n": 351.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2723.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 513.703125,
"completions/mean_terminated_length": 519.7944946289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.0352,
"grad_norm": 0.006140530575066805,
"kl": 0.03408050537109375,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0129,
"num_tokens": 8074606.0,
"reward": 0.34142816066741943,
"reward_std": 0.17432644963264465,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6297796964645386,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.24848589301109314,
"step": 33
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.44686621789425524,
"calib/avg_num_step_conf": 3.3359375,
"calib/ece": 0.30467999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.284,
"calib/gap": -0.005585909417685109,
"calib/mean_conf": 0.8731599999999999,
"calib/mu_c": 0.8707692307692307,
"calib/mu_w": 0.8763551401869158,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.30291999999999997,
"calib/std_conf": 0.05450884698835594,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7898726114649681,
"calib/step_q_c_n": 471.0,
"calib/step_q_gap": 0.022274700237814082,
"calib/step_q_w": 0.767597911227154,
"calib/step_q_w_n": 383.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 520.2265625,
"completions/mean_terminated_length": 524.3228149414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 260.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.007740476168692112,
"kl": 0.03607749938964844,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0164,
"num_tokens": 8312896.0,
"reward": 0.3710823357105255,
"reward_std": 0.20131167769432068,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6339741945266724,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.1957157701253891,
"step": 34
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5440991735537191,
"calib/avg_num_step_conf": 2.4921875,
"calib/ece": 0.3594715447154472,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.21138211382113822,
"calib/gap": 0.011381818181818493,
"calib/mean_conf": 0.8676016260162602,
"calib/mu_c": 0.8732000000000002,
"calib/mu_w": 0.8618181818181817,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.3594715447154472,
"calib/std_conf": 0.054990204573015435,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.8078145695364237,
"calib/step_q_c_n": 302.0,
"calib/step_q_gap": 0.030820521917376165,
"calib/step_q_w": 0.7769940476190476,
"calib/step_q_w_n": 336.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 557.87890625,
"completions/mean_terminated_length": 566.734130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.006301682908087969,
"kl": 0.02927398681640625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0127,
"num_tokens": 8564969.0,
"reward": 0.3017095923423767,
"reward_std": 0.22571972012519836,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5847018957138062,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": -0.26722028851509094,
"step": 35
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5850586611456177,
"calib/avg_num_step_conf": 3.109375,
"calib/ece": 0.12700404858299594,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.242914979757085,
"calib/gap": 0.010705659075224139,
"calib/mean_conf": 0.8719433198380567,
"calib/mu_c": 0.8746739130434783,
"calib/mu_w": 0.8639682539682542,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.12700404858299594,
"calib/std_conf": 0.04224792865476782,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8208474576271185,
"calib/step_q_c_n": 590.0,
"calib/step_q_gap": 0.035701826559157324,
"calib/step_q_w": 0.7851456310679612,
"calib/step_q_w_n": 206.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2694.0,
"completions/max_terminated_length": 2694.0,
"completions/mean_length": 545.00390625,
"completions/mean_terminated_length": 545.00390625,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0384,
"grad_norm": 0.00643440755084157,
"kl": 0.039272308349609375,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0129,
"num_tokens": 8807202.0,
"reward": 0.4795789420604706,
"reward_std": 0.1412101686000824,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7681589722633362,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.14571987092494965,
"step": 36
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6778419779629132,
"calib/avg_num_step_conf": 2.5625,
"calib/ece": 0.36663934426229505,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.21311475409836064,
"calib/gap": 0.03360655737704932,
"calib/mean_conf": 0.8666393442622953,
"calib/mu_c": 0.8834426229508198,
"calib/mu_w": 0.8498360655737704,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.36663934426229505,
"calib/std_conf": 0.0658564093217231,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.8231741573033708,
"calib/step_q_c_n": 356.0,
"calib/step_q_gap": 0.047040823970037504,
"calib/step_q_w": 0.7761333333333333,
"calib/step_q_w_n": 300.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 585.0625,
"completions/mean_terminated_length": 589.6693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.00586958322674036,
"kl": 0.027469635009765625,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0286,
"num_tokens": 9064074.0,
"reward": 0.3125787377357483,
"reward_std": 0.1971425712108612,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.5849460959434509,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": -0.2418198585510254,
"step": 37
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6426985370950888,
"calib/avg_num_step_conf": 2.55859375,
"calib/ece": 0.33435483870967736,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.1975806451612903,
"calib/gap": 0.03169801462904909,
"calib/mean_conf": 0.8666129032258065,
"calib/mu_c": 0.8814393939393939,
"calib/mu_w": 0.8497413793103448,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.33435483870967736,
"calib/std_conf": 0.06886039051511297,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8236464088397789,
"calib/step_q_c_n": 362.0,
"calib/step_q_gap": 0.06634265457356736,
"calib/step_q_w": 0.7573037542662115,
"calib/step_q_w_n": 293.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2634.0,
"completions/max_terminated_length": 2634.0,
"completions/mean_length": 510.89453125,
"completions/mean_terminated_length": 516.9525756835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.007309312000870705,
"kl": 0.028207778930664062,
"learning_rate": 4.5e-06,
"loss": 0.0083,
"num_tokens": 9301751.0,
"reward": 0.33864691853523254,
"reward_std": 0.2210758626461029,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6288172006607056,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.24761712551116943,
"step": 38
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5811217948717949,
"calib/avg_num_step_conf": 2.38671875,
"calib/ece": 0.3505999999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.26,
"calib/gap": 0.024551282051281986,
"calib/mean_conf": 0.8706,
"calib/mu_c": 0.8823846153846154,
"calib/mu_w": 0.8578333333333334,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3505999999999999,
"calib/std_conf": 0.08131445136013649,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8277044025157234,
"calib/step_q_c_n": 318.0,
"calib/step_q_gap": 0.02500815678193502,
"calib/step_q_w": 0.8026962457337884,
"calib/step_q_w_n": 293.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 509.51953125,
"completions/mean_terminated_length": 509.51953125,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0416,
"grad_norm": 0.0069496543146669865,
"kl": 0.03579139709472656,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0183,
"num_tokens": 9538276.0,
"reward": 0.33049649000167847,
"reward_std": 0.18329477310180664,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6080214977264404,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.24077847599983215,
"step": 39
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5396220472440945,
"calib/avg_num_step_conf": 2.109375,
"calib/ece": 0.3734126984126984,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.23015873015873015,
"calib/gap": 0.0073826771653544565,
"calib/mean_conf": 0.874920634920635,
"calib/mu_c": 0.8785826771653544,
"calib/mu_w": 0.8712,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.37218253968253967,
"calib/std_conf": 0.05515844467209437,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.8313358778625954,
"calib/step_q_c_n": 262.0,
"calib/step_q_gap": 0.0030624965676313565,
"calib/step_q_w": 0.8282733812949641,
"calib/step_q_w_n": 278.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 503.171875,
"completions/mean_terminated_length": 503.171875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.006470134947448969,
"kl": 0.0303955078125,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0316,
"num_tokens": 9773848.0,
"reward": 0.2879479229450226,
"reward_std": 0.22962933778762817,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5858784914016724,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": -0.3006076514720917,
"step": 40
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4538764510779436,
"calib/avg_num_step_conf": 2.421875,
"calib/ece": 0.0720481927710844,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.21285140562248997,
"calib/gap": 0.013168532338308392,
"calib/mean_conf": 0.875421686746988,
"calib/mu_c": 0.8779601990049751,
"calib/mu_w": 0.8647916666666667,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.07012048192771092,
"calib/std_conf": 0.06796469444003686,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.8239172749391728,
"calib/step_q_c_n": 411.0,
"calib/step_q_gap": 0.4042043562788856,
"calib/step_q_w": 0.41971291866028715,
"calib/step_q_w_n": 209.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3052.0,
"completions/max_terminated_length": 3052.0,
"completions/mean_length": 457.76171875,
"completions/mean_terminated_length": 459.556884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.006787601392716169,
"kl": 0.0326080322265625,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0792,
"num_tokens": 9998283.0,
"reward": 0.5055362582206726,
"reward_std": 0.1886957734823227,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.7994054555892944,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": -0.13442669808864594,
"step": 41
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5079997450280469,
"calib/avg_num_step_conf": 2.19921875,
"calib/ece": 0.3095669291338584,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3464566929133858,
"calib/gap": -0.001931412544620037,
"calib/mean_conf": 0.8892519685039371,
"calib/mu_c": 0.8884459459459461,
"calib/mu_w": 0.8903773584905661,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3080708661417324,
"calib/std_conf": 0.040204099078814906,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8422923588039867,
"calib/step_q_c_n": 301.0,
"calib/step_q_gap": 0.005498465674215636,
"calib/step_q_w": 0.836793893129771,
"calib/step_q_w_n": 262.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1485.0,
"completions/max_terminated_length": 1485.0,
"completions/mean_length": 416.74609375,
"completions/mean_terminated_length": 418.38043212890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.0448,
"grad_norm": 0.006533120293170214,
"kl": 0.03894805908203125,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0001,
"num_tokens": 10209338.0,
"reward": 0.351402223110199,
"reward_std": 0.161943718791008,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6509097814559937,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.2606053054332733,
"step": 42
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.565625,
"calib/avg_num_step_conf": 2.01953125,
"calib/ece": 0.26007905138339915,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.383399209486166,
"calib/gap": 0.007639784946236516,
"calib/mean_conf": 0.8856916996047431,
"calib/mu_c": 0.8885,
"calib/mu_w": 0.8808602150537634,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25667984189723314,
"calib/std_conf": 0.05129947539851385,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8256676557863502,
"calib/step_q_c_n": 337.0,
"calib/step_q_gap": -0.012776788658094396,
"calib/step_q_w": 0.8384444444444445,
"calib/step_q_w_n": 180.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2265.0,
"completions/max_terminated_length": 2265.0,
"completions/mean_length": 451.59375,
"completions/mean_terminated_length": 451.59375,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.006118349730968475,
"kl": 0.03265380859375,
"learning_rate": 4.361111111111112e-06,
"loss": -0.002,
"num_tokens": 10430170.0,
"reward": 0.39366674423217773,
"reward_std": 0.2639361321926117,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6941671967506409,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.22870871424674988,
"step": 43
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6036445012787724,
"calib/avg_num_step_conf": 1.83203125,
"calib/ece": 0.3441035856573706,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4063745019920319,
"calib/gap": 0.0287698209718672,
"calib/mean_conf": 0.8859362549800797,
"calib/mu_c": 0.8991176470588236,
"calib/mu_w": 0.8703478260869564,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3441035856573706,
"calib/std_conf": 0.07584464568578271,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8515450643776825,
"calib/step_q_c_n": 233.0,
"calib/step_q_gap": 0.024426420309885843,
"calib/step_q_w": 0.8271186440677967,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2333.0,
"completions/max_terminated_length": 2333.0,
"completions/mean_length": 456.8203125,
"completions/mean_terminated_length": 460.4173278808594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.007396138738840818,
"kl": 0.033664703369140625,
"learning_rate": 4.333333333333334e-06,
"loss": 0.014,
"num_tokens": 10653436.0,
"reward": 0.31293433904647827,
"reward_std": 0.2180282473564148,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6254706978797913,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.3019458055496216,
"step": 44
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5447098302408212,
"calib/avg_num_step_conf": 1.82421875,
"calib/ece": 0.30597609561752986,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.49800796812749004,
"calib/gap": 0.01829714436110008,
"calib/mean_conf": 0.892430278884462,
"calib/mu_c": 0.8998657718120804,
"calib/mu_w": 0.8815686274509803,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.30239043824701195,
"calib/std_conf": 0.09723298363772212,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8685123966942149,
"calib/step_q_c_n": 242.0,
"calib/step_q_gap": 0.054823507805325966,
"calib/step_q_w": 0.8136888888888889,
"calib/step_q_w_n": 225.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2540.0,
"completions/max_terminated_length": 2540.0,
"completions/mean_length": 429.51171875,
"completions/mean_terminated_length": 432.8937072753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.048,
"grad_norm": 0.007078626658767462,
"kl": 0.041751861572265625,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0111,
"num_tokens": 10868439.0,
"reward": 0.35687994956970215,
"reward_std": 0.22213514149188995,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6519414186477661,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.24990034103393555,
"step": 45
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5891987179487179,
"calib/avg_num_step_conf": 1.80078125,
"calib/ece": 0.3853199999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.556,
"calib/gap": 0.01215384615384596,
"calib/mean_conf": 0.90532,
"calib/mu_c": 0.9111538461538461,
"calib/mu_w": 0.8990000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.3853199999999999,
"calib/std_conf": 0.03882393076441384,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.8754716981132077,
"calib/step_q_c_n": 212.0,
"calib/step_q_gap": 0.052138364779874435,
"calib/step_q_w": 0.8233333333333333,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2502.0,
"completions/max_terminated_length": 2502.0,
"completions/mean_length": 440.96875,
"completions/mean_terminated_length": 440.96875,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.006802982650697231,
"kl": 0.0423736572265625,
"learning_rate": 4.277777777777778e-06,
"loss": 0.0166,
"num_tokens": 11086095.0,
"reward": 0.2893574833869934,
"reward_std": 0.20869553089141846,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.581676185131073,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": -0.2959299087524414,
"step": 46
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5944444444444444,
"calib/avg_num_step_conf": 1.42578125,
"calib/ece": 0.30052845528455285,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6138211382113821,
"calib/gap": 0.011570833333333197,
"calib/mean_conf": 0.9102845528455284,
"calib/mu_c": 0.9148,
"calib/mu_w": 0.9032291666666667,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.30052845528455285,
"calib/std_conf": 0.03952358981202645,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.8533183856502243,
"calib/step_q_c_n": 223.0,
"calib/step_q_gap": -0.013090065054001121,
"calib/step_q_w": 0.8664084507042255,
"calib/step_q_w_n": 142.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 450.23828125,
"completions/mean_terminated_length": 453.7834777832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.007135537452995777,
"kl": 0.040294647216796875,
"learning_rate": 4.25e-06,
"loss": -0.0103,
"num_tokens": 11307332.0,
"reward": 0.3479617238044739,
"reward_std": 0.20676106214523315,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6331160068511963,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": -0.24266132712364197,
"step": 47
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5348680840163934,
"calib/avg_num_step_conf": 1.7578125,
"calib/ece": 0.4072,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.62,
"calib/gap": -0.0017546106557376762,
"calib/mean_conf": 0.9032,
"calib/mu_c": 0.9023437499999999,
"calib/mu_w": 0.9040983606557376,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3992,
"calib/std_conf": 0.10269255084961128,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8484304932735426,
"calib/step_q_c_n": 223.0,
"calib/step_q_gap": -0.01430078425949699,
"calib/step_q_w": 0.8627312775330396,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2224.0,
"completions/max_terminated_length": 2224.0,
"completions/mean_length": 420.06640625,
"completions/mean_terminated_length": 425.0474548339844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.0512,
"grad_norm": 0.006774710491299629,
"kl": 0.046047210693359375,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0519,
"num_tokens": 11518557.0,
"reward": 0.2743079960346222,
"reward_std": 0.24875454604625702,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5700136423110962,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.3151476979255676,
"step": 48
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5793089513032937,
"calib/avg_num_step_conf": 1.53515625,
"calib/ece": 0.3375000000000001,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8225806451612904,
"calib/gap": 0.010764464201522217,
"calib/mean_conf": 0.9302419354838709,
"calib/mu_c": 0.9346258503401361,
"calib/mu_w": 0.9238613861386139,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3375000000000001,
"calib/std_conf": 0.034964583686681205,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.908986784140969,
"calib/step_q_c_n": 227.0,
"calib/step_q_gap": 0.05904702510482429,
"calib/step_q_w": 0.8499397590361447,
"calib/step_q_w_n": 166.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2719.0,
"completions/max_terminated_length": 2719.0,
"completions/mean_length": 415.40625,
"completions/mean_terminated_length": 417.0353088378906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.00680144689977169,
"kl": 0.053955078125,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0279,
"num_tokens": 11729437.0,
"reward": 0.32491326332092285,
"reward_std": 0.20095054805278778,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.623964786529541,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.2811695337295532,
"step": 49
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6212825933756165,
"calib/avg_num_step_conf": 1.57421875,
"calib/ece": 0.2795617529880476,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8725099601593626,
"calib/gap": 0.015852008456659794,
"calib/mean_conf": 0.9369322709163347,
"calib/mu_c": 0.9423636363636364,
"calib/mu_w": 0.9265116279069766,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2795617529880476,
"calib/std_conf": 0.034832430314519346,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.9162256809338523,
"calib/step_q_c_n": 257.0,
"calib/step_q_gap": 0.049033900111934314,
"calib/step_q_w": 0.867191780821918,
"calib/step_q_w_n": 146.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2453.0,
"completions/max_terminated_length": 2453.0,
"completions/mean_length": 452.24609375,
"completions/mean_terminated_length": 452.24609375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.006964311469346285,
"kl": 0.045654296875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0524,
"num_tokens": 11950572.0,
"reward": 0.37954503297805786,
"reward_std": 0.21893005073070526,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6810219287872314,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.24536937475204468,
"step": 50
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5048956546598056,
"calib/avg_num_step_conf": 1.4453125,
"calib/ece": 0.3005668016194331,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9271255060728745,
"calib/gap": 0.003136077758719269,
"calib/mean_conf": 0.9442914979757084,
"calib/mu_c": 0.9454088050314465,
"calib/mu_w": 0.9422727272727273,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.3005668016194331,
"calib/std_conf": 0.02931408942209797,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.9142060085836909,
"calib/step_q_c_n": 233.0,
"calib/step_q_gap": 0.07362206697785145,
"calib/step_q_w": 0.8405839416058395,
"calib/step_q_w_n": 137.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2935.0,
"completions/max_terminated_length": 2935.0,
"completions/mean_length": 436.2578125,
"completions/mean_terminated_length": 441.43084716796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.0544,
"grad_norm": 0.0068186987191438675,
"kl": 0.0561676025390625,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0299,
"num_tokens": 12171550.0,
"reward": 0.3507247269153595,
"reward_std": 0.22434017062187195,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6525163650512695,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.26591068506240845,
"step": 51
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6222362105594461,
"calib/avg_num_step_conf": 1.375,
"calib/ece": 0.22187999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.05683924777716587,
"calib/mean_conf": 0.93788,
"calib/mu_c": 0.9540223463687152,
"calib/mu_w": 0.8971830985915493,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.22187999999999994,
"calib/std_conf": 0.09374809651400928,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.9365725806451612,
"calib/step_q_c_n": 248.0,
"calib/step_q_gap": 0.08916873449131513,
"calib/step_q_w": 0.8474038461538461,
"calib/step_q_w_n": 104.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2472.0,
"completions/max_terminated_length": 2472.0,
"completions/mean_length": 431.87890625,
"completions/mean_terminated_length": 435.279541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.013513702899217606,
"kl": 0.07970809936523438,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0306,
"num_tokens": 12390063.0,
"reward": 0.4429655075073242,
"reward_std": 0.21031616628170013,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7162933349609375,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": -0.1592685878276825,
"step": 52
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6598223824786325,
"calib/avg_num_step_conf": 1.3671875,
"calib/ece": 0.3287698412698412,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9206349206349206,
"calib/gap": 0.022051282051281706,
"calib/mean_conf": 0.9478174603174603,
"calib/mu_c": 0.9562179487179484,
"calib/mu_w": 0.9341666666666667,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3287698412698412,
"calib/std_conf": 0.038426367599388646,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.9468877551020408,
"calib/step_q_c_n": 196.0,
"calib/step_q_gap": 0.07597866419294985,
"calib/step_q_w": 0.870909090909091,
"calib/step_q_w_n": 154.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1601.0,
"completions/max_terminated_length": 1601.0,
"completions/mean_length": 425.6953125,
"completions/mean_terminated_length": 427.36474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.006831077393144369,
"kl": 0.040790557861328125,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0349,
"num_tokens": 12604865.0,
"reward": 0.34930217266082764,
"reward_std": 0.19114413857460022,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6468698978424072,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.26545315980911255,
"step": 53
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6534778225806452,
"calib/avg_num_step_conf": 1.828125,
"calib/ece": 0.1966929133858267,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9330708661417323,
"calib/gap": 0.05971774193548396,
"calib/mean_conf": 0.9441732283464568,
"calib/mu_c": 0.9587500000000001,
"calib/mu_w": 0.8990322580645161,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19248031496062984,
"calib/std_conf": 0.11960197075827311,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9289714285714286,
"calib/step_q_c_n": 350.0,
"calib/step_q_gap": 0.09202227602905577,
"calib/step_q_w": 0.8369491525423728,
"calib/step_q_w_n": 118.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1097.0,
"completions/max_terminated_length": 1097.0,
"completions/mean_length": 374.33984375,
"completions/mean_terminated_length": 375.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.0576,
"grad_norm": 0.007281064055860043,
"kl": 0.052005767822265625,
"learning_rate": 4.055555555555556e-06,
"loss": -0.004,
"num_tokens": 12806928.0,
"reward": 0.4682023525238037,
"reward_std": 0.15885430574417114,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7816210985183716,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.19365385174751282,
"step": 54
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5886446886446887,
"calib/avg_num_step_conf": 1.80078125,
"calib/ece": 0.4545703125000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9375,
"calib/gap": 0.04373748473748473,
"calib/mean_conf": 0.9467578125,
"calib/mu_c": 0.9689682539682539,
"calib/mu_w": 0.9252307692307692,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4545703125000001,
"calib/std_conf": 0.10804384570726294,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9283193277310924,
"calib/step_q_c_n": 238.0,
"calib/step_q_gap": 0.050561480197460096,
"calib/step_q_w": 0.8777578475336323,
"calib/step_q_w_n": 223.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1149.0,
"completions/max_terminated_length": 1149.0,
"completions/mean_length": 412.046875,
"completions/mean_terminated_length": 413.66278076171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.00746389152482152,
"kl": 0.04831695556640625,
"learning_rate": 4.027777777777779e-06,
"loss": 0.0122,
"num_tokens": 13020236.0,
"reward": 0.2420576512813568,
"reward_std": 0.26765334606170654,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5534613132476807,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.36622104048728943,
"step": 55
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5570409982174688,
"calib/avg_num_step_conf": 2.00390625,
"calib/ece": 0.35388888888888886,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9523809523809523,
"calib/gap": 0.013107546048722551,
"calib/mean_conf": 0.9600793650793652,
"calib/mu_c": 0.9652287581699347,
"calib/mu_w": 0.9521212121212121,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.3534126984126984,
"calib/std_conf": 0.0583910738958275,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.8669716088328077,
"calib/step_q_c_n": 317.0,
"calib/step_q_gap": 0.008502221077705485,
"calib/step_q_w": 0.8584693877551022,
"calib/step_q_w_n": 196.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 455.26953125,
"completions/mean_terminated_length": 455.26953125,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.006870965473353863,
"kl": 0.040874481201171875,
"learning_rate": 4.000000000000001e-06,
"loss": 0.047,
"num_tokens": 13243625.0,
"reward": 0.3485686480998993,
"reward_std": 0.21139225363731384,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6283090114593506,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.24289044737815857,
"step": 56
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5856104651162791,
"calib/avg_num_step_conf": 1.62890625,
"calib/ece": 0.2908730158730158,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9603174603174603,
"calib/gap": 0.015034883720930203,
"calib/mean_conf": 0.9647619047619047,
"calib/mu_c": 0.9695348837209302,
"calib/mu_w": 0.9545,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.28654761904761894,
"calib/std_conf": 0.08153015106621486,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.942695035460993,
"calib/step_q_c_n": 282.0,
"calib/step_q_gap": 0.01780614657210422,
"calib/step_q_w": 0.9248888888888888,
"calib/step_q_w_n": 135.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3005.0,
"completions/max_terminated_length": 3005.0,
"completions/mean_length": 431.2265625,
"completions/mean_terminated_length": 432.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0608,
"grad_norm": 0.0062557547353208065,
"kl": 0.048091888427734375,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0289,
"num_tokens": 13460811.0,
"reward": 0.38320237398147583,
"reward_std": 0.21609210968017578,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6925468444824219,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.25739216804504395,
"step": 57
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.47277458961697594,
"calib/avg_num_step_conf": 1.75390625,
"calib/ece": 0.45485714285714296,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9755102040816327,
"calib/gap": -0.0031923128253035227,
"calib/mean_conf": 0.9678367346938775,
"calib/mu_c": 0.9662992125984252,
"calib/mu_w": 0.9694915254237287,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.45216326530612255,
"calib/std_conf": 0.04296906668032938,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.8950500000000001,
"calib/step_q_c_n": 200.0,
"calib/step_q_gap": 0.08822269076305223,
"calib/step_q_w": 0.8068273092369479,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2044.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 514.9296875,
"completions/mean_terminated_length": 518.9842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.006043759640306234,
"kl": 0.043193817138671875,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0193,
"num_tokens": 13698953.0,
"reward": 0.2258208990097046,
"reward_std": 0.29372695088386536,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5175651907920837,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.355767160654068,
"step": 58
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5737896169550845,
"calib/avg_num_step_conf": 1.9921875,
"calib/ece": 0.40988000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.932,
"calib/gap": 0.030208697906539905,
"calib/mean_conf": 0.9507599999999999,
"calib/mu_c": 0.9641726618705037,
"calib/mu_w": 0.9339639639639638,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4023200000000001,
"calib/std_conf": 0.1336855354928124,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.9121455938697319,
"calib/step_q_c_n": 261.0,
"calib/step_q_gap": 0.15475603563680007,
"calib/step_q_w": 0.7573895582329319,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2455.0,
"completions/max_terminated_length": 2455.0,
"completions/mean_length": 437.203125,
"completions/mean_terminated_length": 440.6456604003906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.006019299384206533,
"kl": 0.050403594970703125,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0107,
"num_tokens": 13917125.0,
"reward": 0.2786928415298462,
"reward_std": 0.25457003712654114,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5765078067779541,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.32224711775779724,
"step": 59
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5588255124056095,
"calib/avg_num_step_conf": 2.07421875,
"calib/ece": 0.38817813765182196,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9676113360323887,
"calib/gap": 0.011173139158576206,
"calib/mean_conf": 0.971174089068826,
"calib/mu_c": 0.9758333333333332,
"calib/mu_w": 0.964660194174757,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.38817813765182196,
"calib/std_conf": 0.04108643570655636,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.9142356687898091,
"calib/step_q_c_n": 314.0,
"calib/step_q_gap": 0.044420000587044095,
"calib/step_q_w": 0.869815668202765,
"calib/step_q_w_n": 217.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2847.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 463.4609375,
"completions/mean_terminated_length": 468.95654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.064,
"grad_norm": 0.006463638506829739,
"kl": 0.053012847900390625,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0161,
"num_tokens": 14144627.0,
"reward": 0.2865564227104187,
"reward_std": 0.24216191470623016,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5803898572921753,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": -0.30962073802948,
"step": 60
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5601492965834052,
"calib/avg_num_step_conf": 2.45703125,
"calib/ece": 0.28454545454545443,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9407114624505929,
"calib/gap": 0.03187697387309785,
"calib/mean_conf": 0.9643873517786562,
"calib/mu_c": 0.9745930232558139,
"calib/mu_w": 0.942716049382716,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.28454545454545443,
"calib/std_conf": 0.07765409363964547,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8914861460957177,
"calib/step_q_c_n": 397.0,
"calib/step_q_gap": 0.10708959437157972,
"calib/step_q_w": 0.784396551724138,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1178.0,
"completions/max_terminated_length": 1178.0,
"completions/mean_length": 416.92578125,
"completions/mean_terminated_length": 418.5608215332031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.0071130190044641495,
"kl": 0.05157470703125,
"learning_rate": 3.861111111111112e-06,
"loss": 0.014,
"num_tokens": 14355424.0,
"reward": 0.384852796792984,
"reward_std": 0.2306276559829712,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6892913579940796,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.2500545382499695,
"step": 61
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5288929840726603,
"calib/avg_num_step_conf": 2.08203125,
"calib/ece": 0.40590361445783146,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9317269076305221,
"calib/gap": -0.00025536395945779944,
"calib/mean_conf": 0.9554618473895583,
"calib/mu_c": 0.9553521126760561,
"calib/mu_w": 0.9556074766355139,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.39554216867469894,
"calib/std_conf": 0.1157717579982303,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.8620863309352517,
"calib/step_q_c_n": 278.0,
"calib/step_q_gap": 0.09330201720976139,
"calib/step_q_w": 0.7687843137254903,
"calib/step_q_w_n": 255.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2821.0,
"completions/max_terminated_length": 2821.0,
"completions/mean_length": 466.50390625,
"completions/mean_terminated_length": 468.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.005557631608098745,
"kl": 0.043121337890625,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0031,
"num_tokens": 14581929.0,
"reward": 0.25923866033554077,
"reward_std": 0.3093259930610657,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5729343891143799,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.35914450883865356,
"step": 62
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6637687969924813,
"calib/avg_num_step_conf": 1.73046875,
"calib/ece": 0.36468,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.944,
"calib/gap": 0.020064446831364324,
"calib/mean_conf": 0.9692400000000002,
"calib/mu_c": 0.9771052631578948,
"calib/mu_w": 0.9570408163265305,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.36296,
"calib/std_conf": 0.04494243429099051,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.918637992831541,
"calib/step_q_c_n": 279.0,
"calib/step_q_gap": 0.07058921234373627,
"calib/step_q_w": 0.8480487804878047,
"calib/step_q_w_n": 164.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 501.34765625,
"completions/mean_terminated_length": 503.3137512207031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.0672,
"grad_norm": 0.005798683501780033,
"kl": 0.044086456298828125,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0373,
"num_tokens": 14818914.0,
"reward": 0.3223249912261963,
"reward_std": 0.2504933476448059,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6075301170349121,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.2722550630569458,
"step": 63
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5043233082706767,
"calib/avg_num_step_conf": 1.9765625,
"calib/ece": 0.2878486055776894,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8605577689243028,
"calib/gap": 0.004703759398496543,
"calib/mean_conf": 0.9174900398406375,
"calib/mu_c": 0.9189142857142857,
"calib/mu_w": 0.9142105263157891,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.25406374501992046,
"calib/std_conf": 0.18744368317696553,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.8397640117994102,
"calib/step_q_c_n": 339.0,
"calib/step_q_gap": 0.02072209563174565,
"calib/step_q_w": 0.8190419161676645,
"calib/step_q_w_n": 167.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2550.0,
"completions/max_terminated_length": 2550.0,
"completions/mean_length": 435.98828125,
"completions/mean_terminated_length": 437.69805908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.006303312722593546,
"kl": 0.049251556396484375,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0548,
"num_tokens": 15034303.0,
"reward": 0.3864745497703552,
"reward_std": 0.2799364924430847,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.6870027184486389,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.24530363082885742,
"step": 64
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5792896283092362,
"calib/avg_num_step_conf": 1.7890625,
"calib/ece": 0.37591269841269853,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9563492063492064,
"calib/gap": 0.0002792632204396961,
"calib/mean_conf": 0.9728968253968254,
"calib/mu_c": 0.9730065359477126,
"calib/mu_w": 0.9727272727272729,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.37083333333333346,
"calib/std_conf": 0.0524625836986189,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.9113553113553114,
"calib/step_q_c_n": 273.0,
"calib/step_q_gap": -0.008158202158202177,
"calib/step_q_w": 0.9195135135135136,
"calib/step_q_w_n": 185.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2712.0,
"completions/max_terminated_length": 2712.0,
"completions/mean_length": 380.93359375,
"completions/mean_terminated_length": 383.9330749511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.006032762583345175,
"kl": 0.0626678466796875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.011,
"num_tokens": 15236846.0,
"reward": 0.2991948425769806,
"reward_std": 0.1917351335287094,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.5997199416160583,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.3146114945411682,
"step": 65
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6051844783715014,
"calib/avg_num_step_conf": 1.91796875,
"calib/ece": 0.4349800796812749,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9083665338645418,
"calib/gap": 0.03393702290076328,
"calib/mean_conf": 0.9444621513944225,
"calib/mu_c": 0.9606870229007634,
"calib/mu_w": 0.9267500000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.4287649402390438,
"calib/std_conf": 0.14269185575553064,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.9092703862660945,
"calib/step_q_c_n": 233.0,
"calib/step_q_gap": 0.1638052699870246,
"calib/step_q_w": 0.7454651162790699,
"calib/step_q_w_n": 258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2388.0,
"completions/max_terminated_length": 2388.0,
"completions/mean_length": 492.7421875,
"completions/mean_terminated_length": 494.6745300292969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.0704,
"grad_norm": 0.005543500185012817,
"kl": 0.049045562744140625,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0261,
"num_tokens": 15469340.0,
"reward": 0.26769766211509705,
"reward_std": 0.24292393028736115,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5495148301124573,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.31021326780319214,
"step": 66
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6902941778718176,
"calib/avg_num_step_conf": 1.7890625,
"calib/ece": 0.32178571428571445,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9404761904761905,
"calib/gap": 0.04612040133779283,
"calib/mean_conf": 0.9606746031746033,
"calib/mu_c": 0.977329192546584,
"calib/mu_w": 0.9312087912087912,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.32178571428571445,
"calib/std_conf": 0.10923226523193443,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.9151438848920863,
"calib/step_q_c_n": 278.0,
"calib/step_q_gap": 0.14947721822541948,
"calib/step_q_w": 0.7656666666666668,
"calib/step_q_w_n": 180.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2895.0,
"completions/max_terminated_length": 2895.0,
"completions/mean_length": 461.453125,
"completions/mean_terminated_length": 461.453125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.006038357503712177,
"kl": 0.05561065673828125,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0241,
"num_tokens": 15692480.0,
"reward": 0.36475488543510437,
"reward_std": 0.1881740391254425,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.660184383392334,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.2517683506011963,
"step": 67
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7296124927703874,
"calib/avg_num_step_conf": 2.328125,
"calib/ece": 0.41103999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.864,
"calib/gap": 0.06692500481974173,
"calib/mean_conf": 0.94304,
"calib/mu_c": 0.9743609022556391,
"calib/mu_w": 0.9074358974358974,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.41103999999999996,
"calib/std_conf": 0.1097212759677903,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.873956834532374,
"calib/step_q_c_n": 278.0,
"calib/step_q_gap": 0.26574928736256265,
"calib/step_q_w": 0.6082075471698114,
"calib/step_q_w_n": 318.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 498.98828125,
"completions/mean_terminated_length": 500.94512939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.005618997849524021,
"kl": 0.052722930908203125,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0285,
"num_tokens": 15924309.0,
"reward": 0.2997767925262451,
"reward_std": 0.23451535403728485,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5811023712158203,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.2784237563610077,
"step": 68
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6614257812500001,
"calib/avg_num_step_conf": 1.93359375,
"calib/ece": 0.4521774193548388,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8024193548387096,
"calib/gap": 0.04785937500000015,
"calib/mean_conf": 0.9110483870967742,
"calib/mu_c": 0.9357500000000002,
"calib/mu_w": 0.887890625,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.4396774193548388,
"calib/std_conf": 0.17850100997449472,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8308000000000001,
"calib/step_q_c_n": 200.0,
"calib/step_q_gap": 0.09486779661016953,
"calib/step_q_w": 0.7359322033898306,
"calib/step_q_w_n": 295.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2933.0,
"completions/max_terminated_length": 2933.0,
"completions/mean_length": 543.10546875,
"completions/mean_terminated_length": 547.3818969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.0736,
"grad_norm": 0.005082065239548683,
"kl": 0.041614532470703125,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0109,
"num_tokens": 16167840.0,
"reward": 0.2741231620311737,
"reward_std": 0.29141664505004883,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5398664474487305,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.2775576114654541,
"step": 69
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6298890429958391,
"calib/avg_num_step_conf": 1.99609375,
"calib/ece": 0.35407407407407404,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8271604938271605,
"calib/gap": 0.08578224687933456,
"calib/mean_conf": 0.8983539094650206,
"calib/mu_c": 0.9347142857142858,
"calib/mu_w": 0.8489320388349513,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.3381481481481481,
"calib/std_conf": 0.21164398210110583,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.8400694444444444,
"calib/step_q_c_n": 288.0,
"calib/step_q_gap": 0.13997975834578968,
"calib/step_q_w": 0.7000896860986547,
"calib/step_q_w_n": 223.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 541.81640625,
"completions/mean_terminated_length": 543.9412231445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.005732155870646238,
"kl": 0.042018890380859375,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0214,
"num_tokens": 16413537.0,
"reward": 0.3209608197212219,
"reward_std": 0.2563707232475281,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6122055053710938,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": -0.2687213718891144,
"step": 70
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6134592910011687,
"calib/avg_num_step_conf": 1.984375,
"calib/ece": 0.3526482213438736,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8181818181818182,
"calib/gap": 0.021333593039864707,
"calib/mean_conf": 0.9184189723320159,
"calib/mu_c": 0.9270198675496686,
"calib/mu_w": 0.9056862745098039,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3371146245059289,
"calib/std_conf": 0.1589039482781675,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8082517482517484,
"calib/step_q_c_n": 286.0,
"calib/step_q_gap": 0.07293643293643304,
"calib/step_q_w": 0.7353153153153154,
"calib/step_q_w_n": 222.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2579.0,
"completions/max_terminated_length": 2579.0,
"completions/mean_length": 455.125,
"completions/mean_terminated_length": 455.125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.005377875175327063,
"kl": 0.052242279052734375,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0012,
"num_tokens": 16634457.0,
"reward": 0.34045037627220154,
"reward_std": 0.27277880907058716,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6282824277877808,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.2614441514015198,
"step": 71
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7194603174603174,
"calib/avg_num_step_conf": 2.109375,
"calib/ece": 0.3391764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8274509803921568,
"calib/gap": 0.091447619047619,
"calib/mean_conf": 0.9274117647058824,
"calib/mu_c": 0.9650666666666666,
"calib/mu_w": 0.8736190476190476,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3391764705882353,
"calib/std_conf": 0.14417706658223547,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.8171304347826087,
"calib/step_q_c_n": 345.0,
"calib/step_q_gap": 0.12800222965440367,
"calib/step_q_w": 0.689128205128205,
"calib/step_q_w_n": 195.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1392.0,
"completions/max_terminated_length": 1392.0,
"completions/mean_length": 424.66015625,
"completions/mean_terminated_length": 426.32550048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.0768,
"grad_norm": 0.006489574443548918,
"kl": 0.058135986328125,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0029,
"num_tokens": 16847578.0,
"reward": 0.3816695511341095,
"reward_std": 0.19125699996948242,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.658865213394165,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.20880737900733948,
"step": 72
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6533471359558316,
"calib/avg_num_step_conf": 1.953125,
"calib/ece": 0.29110756972111546,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8406374501992032,
"calib/gap": 0.09448378191856455,
"calib/mean_conf": 0.9244940239043826,
"calib/mu_c": 0.9583726708074535,
"calib/mu_w": 0.8638888888888889,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.2870836653386454,
"calib/std_conf": 0.1598070891455655,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.8070538922155687,
"calib/step_q_c_n": 334.0,
"calib/step_q_gap": 0.042234615107135,
"calib/step_q_w": 0.7648192771084337,
"calib/step_q_w_n": 166.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 472.703125,
"completions/mean_terminated_length": 472.703125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.005386182572692633,
"kl": 0.046176910400390625,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0025,
"num_tokens": 17075622.0,
"reward": 0.3789359927177429,
"reward_std": 0.2643182575702667,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6753011345863342,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": -0.23461660742759705,
"step": 73
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6607167566071677,
"calib/avg_num_step_conf": 2.27734375,
"calib/ece": 0.32689795918367354,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7795918367346939,
"calib/gap": 0.07555901480559035,
"calib/mean_conf": 0.8896734693877553,
"calib/mu_c": 0.920205479452055,
"calib/mu_w": 0.8446464646464646,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.31032653061224497,
"calib/std_conf": 0.21682659183960734,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7695512820512821,
"calib/step_q_c_n": 312.0,
"calib/step_q_gap": 0.15338892042766605,
"calib/step_q_w": 0.616162361623616,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 471.70703125,
"completions/mean_terminated_length": 477.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.0063706678338348866,
"kl": 0.051799774169921875,
"learning_rate": 3.5e-06,
"loss": 0.0317,
"num_tokens": 17300307.0,
"reward": 0.3643389344215393,
"reward_std": 0.22409476339817047,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6295297145843506,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": -0.20475810766220093,
"step": 74
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7152218152218153,
"calib/avg_num_step_conf": 2.171875,
"calib/ece": 0.16940944881889786,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7755905511811023,
"calib/gap": 0.13941961741961728,
"calib/mean_conf": 0.9071259842519686,
"calib/mu_c": 0.9428042328042328,
"calib/mu_w": 0.8033846153846155,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1662204724409451,
"calib/std_conf": 0.16891970287012711,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7913853904282114,
"calib/step_q_c_n": 397.0,
"calib/step_q_gap": 0.17094513885588436,
"calib/step_q_w": 0.6204402515723271,
"calib/step_q_w_n": 159.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 413.25390625,
"completions/mean_terminated_length": 414.8745422363281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.08,
"grad_norm": 0.006375730037689209,
"kl": 0.05806732177734375,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0151,
"num_tokens": 17510852.0,
"reward": 0.49380946159362793,
"reward_std": 0.20836231112480164,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.78885817527771,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.14420804381370544,
"step": 75
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.728522920203735,
"calib/avg_num_step_conf": 1.8359375,
"calib/ece": 0.27392,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.68,
"calib/gap": 0.14338879456706288,
"calib/mean_conf": 0.84848,
"calib/mu_c": 0.9029677419354838,
"calib/mu_w": 0.7595789473684209,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2512,
"calib/std_conf": 0.2451923522461498,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7692057761732852,
"calib/step_q_c_n": 277.0,
"calib/step_q_gap": 0.17977572435981382,
"calib/step_q_w": 0.5894300518134714,
"calib/step_q_w_n": 193.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2904.0,
"completions/max_terminated_length": 2904.0,
"completions/mean_length": 464.3984375,
"completions/mean_terminated_length": 466.2196350097656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.0066471709869802,
"kl": 0.058727264404296875,
"learning_rate": 3.444444444444445e-06,
"loss": 0.062,
"num_tokens": 17732794.0,
"reward": 0.4030284285545349,
"reward_std": 0.258221834897995,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.702775776386261,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.21312521398067474,
"step": 76
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6567358400109581,
"calib/avg_num_step_conf": 2.41015625,
"calib/ece": 0.2280799999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.536,
"calib/gap": 0.13451133484007938,
"calib/mean_conf": 0.782,
"calib/mu_c": 0.8320382165605096,
"calib/mu_w": 0.6975268817204302,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.19103999999999982,
"calib/std_conf": 0.2722631080407333,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6617705735660848,
"calib/step_q_c_n": 401.0,
"calib/step_q_gap": 0.12436316615867737,
"calib/step_q_w": 0.5374074074074074,
"calib/step_q_w_n": 216.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2475.0,
"completions/max_terminated_length": 2475.0,
"completions/mean_length": 483.44140625,
"completions/mean_terminated_length": 485.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.0059607732109725475,
"kl": 0.056232452392578125,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0241,
"num_tokens": 17961219.0,
"reward": 0.4120379686355591,
"reward_std": 0.20473268628120422,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7089191675186157,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.20124945044517517,
"step": 77
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6365384615384616,
"calib/avg_num_step_conf": 2.375,
"calib/ece": 0.23976095617529875,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5418326693227091,
"calib/gap": 0.14284821428571448,
"calib/mean_conf": 0.7667729083665339,
"calib/mu_c": 0.8185625000000002,
"calib/mu_w": 0.6757142857142857,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18454183266932267,
"calib/std_conf": 0.29337088865439276,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5924303797468354,
"calib/step_q_c_n": 395.0,
"calib/step_q_gap": 0.04768859570927664,
"calib/step_q_w": 0.5447417840375588,
"calib/step_q_w_n": 213.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2867.0,
"completions/max_terminated_length": 2867.0,
"completions/mean_length": 510.5078125,
"completions/mean_terminated_length": 512.5098266601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.0832,
"grad_norm": 0.005696735344827175,
"kl": 0.05327606201171875,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.022,
"num_tokens": 18199933.0,
"reward": 0.4230232834815979,
"reward_std": 0.22738784551620483,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7163605690002441,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.19062650203704834,
"step": 78
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6093189964157706,
"calib/avg_num_step_conf": 2.15625,
"calib/ece": 0.2187450980392157,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.47843137254901963,
"calib/gap": 0.10993628036638792,
"calib/mean_conf": 0.7760784313725491,
"calib/mu_c": 0.8161728395061728,
"calib/mu_w": 0.7062365591397849,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.179764705882353,
"calib/std_conf": 0.2641440369947661,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6336994219653179,
"calib/step_q_c_n": 346.0,
"calib/step_q_gap": 0.0751071889556092,
"calib/step_q_w": 0.5585922330097087,
"calib/step_q_w_n": 206.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2250.0,
"completions/max_terminated_length": 2250.0,
"completions/mean_length": 475.4921875,
"completions/mean_terminated_length": 475.4921875,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.005750508978962898,
"kl": 0.047275543212890625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0058,
"num_tokens": 18428035.0,
"reward": 0.42249900102615356,
"reward_std": 0.21679085493087769,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7268054485321045,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.20758873224258423,
"step": 79
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6096211153682418,
"calib/avg_num_step_conf": 2.63671875,
"calib/ece": 0.2477647058823531,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5803921568627451,
"calib/gap": 0.037711792252022036,
"calib/mean_conf": 0.8214117647058824,
"calib/mu_c": 0.8333908045977011,
"calib/mu_w": 0.7956790123456791,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1934117647058825,
"calib/std_conf": 0.2444817789810696,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6234232209737828,
"calib/step_q_c_n": 445.0,
"calib/step_q_gap": 0.053944960104217565,
"calib/step_q_w": 0.5694782608695652,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 409.6953125,
"completions/mean_terminated_length": 409.6953125,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.005894418340176344,
"kl": 0.0623931884765625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.018,
"num_tokens": 18635077.0,
"reward": 0.4095434546470642,
"reward_std": 0.21538078784942627,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7176773548126221,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.23374667763710022,
"step": 80
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.623547335600907,
"calib/avg_num_step_conf": 2.45703125,
"calib/ece": 0.24091269841269836,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5119047619047619,
"calib/gap": 0.08642857142857152,
"calib/mean_conf": 0.7660714285714286,
"calib/mu_c": 0.7948809523809525,
"calib/mu_w": 0.708452380952381,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1701587301587301,
"calib/std_conf": 0.2805678108501771,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6212747875354108,
"calib/step_q_c_n": 353.0,
"calib/step_q_gap": 0.1786298599991788,
"calib/step_q_w": 0.44264492753623197,
"calib/step_q_w_n": 276.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2770.0,
"completions/max_terminated_length": 2770.0,
"completions/mean_length": 441.99609375,
"completions/mean_terminated_length": 445.47637939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0864,
"grad_norm": 0.006212920416146517,
"kl": 0.050567626953125,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0056,
"num_tokens": 18854476.0,
"reward": 0.4199260473251343,
"reward_std": 0.2087436318397522,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7162222266197205,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.2044951617717743,
"step": 81
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7415920010388262,
"calib/avg_num_step_conf": 2.515625,
"calib/ece": 0.23395256916996054,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5217391304347826,
"calib/gap": 0.15813465783664438,
"calib/mean_conf": 0.8060474308300396,
"calib/mu_c": 0.8698013245033113,
"calib/mu_w": 0.7116666666666669,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22158102766798424,
"calib/std_conf": 0.2462384800445783,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6318965517241381,
"calib/step_q_c_n": 348.0,
"calib/step_q_gap": 0.1533830382106246,
"calib/step_q_w": 0.47851351351351346,
"calib/step_q_w_n": 296.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 427.6484375,
"completions/mean_terminated_length": 427.6484375,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.006339720916002989,
"kl": 0.059291839599609375,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0334,
"num_tokens": 19069506.0,
"reward": 0.4325372874736786,
"reward_std": 0.19591914117336273,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7225097417831421,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.17306017875671387,
"step": 82
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7263938124395747,
"calib/avg_num_step_conf": 2.27734375,
"calib/ece": 0.2421428571428571,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5198412698412699,
"calib/gap": 0.2121882049629391,
"calib/mean_conf": 0.7561111111111112,
"calib/mu_c": 0.8462068965517241,
"calib/mu_w": 0.634018691588785,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2114285714285714,
"calib/std_conf": 0.30521257807149943,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6393235294117647,
"calib/step_q_c_n": 340.0,
"calib/step_q_gap": 0.16500254175744378,
"calib/step_q_w": 0.47432098765432096,
"calib/step_q_w_n": 243.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 461.54296875,
"completions/mean_terminated_length": 465.1771545410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.005662387236952782,
"kl": 0.052764892578125,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0617,
"num_tokens": 19294925.0,
"reward": 0.42235541343688965,
"reward_std": 0.21044637262821198,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7220921516418457,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.18753761053085327,
"step": 83
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6842507645259939,
"calib/avg_num_step_conf": 2.26171875,
"calib/ece": 0.20549407114624516,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.40711462450592883,
"calib/gap": 0.17332441386340436,
"calib/mean_conf": 0.7326877470355732,
"calib/mu_c": 0.8073611111111111,
"calib/mu_w": 0.6340366972477067,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18450592885375502,
"calib/std_conf": 0.28295848333032897,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.662012012012012,
"calib/step_q_c_n": 333.0,
"calib/step_q_gap": 0.1669099794916868,
"calib/step_q_w": 0.4951020325203252,
"calib/step_q_w_n": 246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1560.0,
"completions/max_terminated_length": 1560.0,
"completions/mean_length": 412.18359375,
"completions/mean_terminated_length": 413.8000183105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0896,
"grad_norm": 0.006523535121232271,
"kl": 0.056682586669921875,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0174,
"num_tokens": 19506364.0,
"reward": 0.40939784049987793,
"reward_std": 0.222348153591156,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7226859331130981,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.21404653787612915,
"step": 84
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7190209790209792,
"calib/avg_num_step_conf": 2.66015625,
"calib/ece": 0.21930041152263377,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.448559670781893,
"calib/gap": 0.17769020979020966,
"calib/mean_conf": 0.7566666666666667,
"calib/mu_c": 0.8297902097902098,
"calib/mu_w": 0.6521000000000001,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.19374485596707822,
"calib/std_conf": 0.27149032374059606,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6223906705539358,
"calib/step_q_c_n": 343.0,
"calib/step_q_gap": 0.16008297824624346,
"calib/step_q_w": 0.46230769230769236,
"calib/step_q_w_n": 338.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2584.0,
"completions/max_terminated_length": 2584.0,
"completions/mean_length": 482.578125,
"completions/mean_terminated_length": 492.1912536621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.005401519127190113,
"kl": 0.051334381103515625,
"learning_rate": 3.1944444444444443e-06,
"loss": 0.0206,
"num_tokens": 19737728.0,
"reward": 0.42077910900115967,
"reward_std": 0.20061340928077698,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6981011629104614,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": -0.1573241800069809,
"step": 85
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7217864923747277,
"calib/avg_num_step_conf": 2.390625,
"calib/ece": 0.16023622047244102,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2992125984251969,
"calib/gap": 0.23603112356053513,
"calib/mean_conf": 0.6639370078740157,
"calib/mu_c": 0.7745185185185184,
"calib/mu_w": 0.5384873949579833,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.14633858267716543,
"calib/std_conf": 0.30225180481232367,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6139563862928349,
"calib/step_q_c_n": 321.0,
"calib/step_q_gap": 0.10969521790795511,
"calib/step_q_w": 0.5042611683848798,
"calib/step_q_w_n": 291.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2180.0,
"completions/max_terminated_length": 2180.0,
"completions/mean_length": 456.72265625,
"completions/mean_terminated_length": 458.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.00633582565933466,
"kl": 0.056629180908203125,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0374,
"num_tokens": 19960161.0,
"reward": 0.43442198634147644,
"reward_std": 0.18732504546642303,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7422605752944946,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.17419779300689697,
"step": 86
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6566960218645612,
"calib/avg_num_step_conf": 2.45703125,
"calib/ece": 0.18626984126984128,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4642857142857143,
"calib/gap": 0.10433647130276336,
"calib/mean_conf": 0.793968253968254,
"calib/mu_c": 0.8246067415730336,
"calib/mu_w": 0.7202702702702702,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.13694444444444448,
"calib/std_conf": 0.2305981847320427,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6517913832199547,
"calib/step_q_c_n": 441.0,
"calib/step_q_gap": 0.11157861726250784,
"calib/step_q_w": 0.5402127659574468,
"calib/step_q_w_n": 188.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 416.15234375,
"completions/mean_terminated_length": 421.08697509765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.0928,
"grad_norm": 0.006363210268318653,
"kl": 0.05908203125,
"learning_rate": 3.138888888888889e-06,
"loss": 0.035,
"num_tokens": 20172192.0,
"reward": 0.4662480652332306,
"reward_std": 0.17796066403388977,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7597273588180542,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.16160625219345093,
"step": 87
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7926428571428571,
"calib/avg_num_step_conf": 2.234375,
"calib/ece": 0.13345098039215686,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.28195357142857136,
"calib/mean_conf": 0.7933725490196079,
"calib/mu_c": 0.8818285714285714,
"calib/mu_w": 0.599875,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12027450980392158,
"calib/std_conf": 0.26565272633863474,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6617391304347826,
"calib/step_q_c_n": 368.0,
"calib/step_q_gap": 0.18865089514066502,
"calib/step_q_w": 0.4730882352941176,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2193.0,
"completions/max_terminated_length": 2193.0,
"completions/mean_length": 440.140625,
"completions/mean_terminated_length": 440.140625,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.0057291858829557896,
"kl": 0.0485076904296875,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0265,
"num_tokens": 20394716.0,
"reward": 0.5039481520652771,
"reward_std": 0.183674156665802,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.8173171877861023,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.14457716047763824,
"step": 88
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7872451306103837,
"calib/avg_num_step_conf": 1.90625,
"calib/ece": 0.27201612903225797,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.43951612903225806,
"calib/gap": 0.21800664451827267,
"calib/mean_conf": 0.7491129032258066,
"calib/mu_c": 0.8537209302325584,
"calib/mu_w": 0.6357142857142857,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25048387096774183,
"calib/std_conf": 0.27707723418656127,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7401333333333332,
"calib/step_q_c_n": 225.0,
"calib/step_q_gap": 0.21549455006337115,
"calib/step_q_w": 0.524638783269962,
"calib/step_q_w_n": 263.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 494.5859375,
"completions/mean_terminated_length": 502.4365234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.005575645249336958,
"kl": 0.04261016845703125,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.0259,
"num_tokens": 20630218.0,
"reward": 0.4111081659793854,
"reward_std": 0.18856662511825562,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7072281241416931,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.17954307794570923,
"step": 89
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6472014260249553,
"calib/avg_num_step_conf": 3.0078125,
"calib/ece": 0.18515999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.54,
"calib/gap": 0.15693761140819973,
"calib/mean_conf": 0.80252,
"calib/mu_c": 0.8558787878787879,
"calib/mu_w": 0.6989411764705882,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16383999999999996,
"calib/std_conf": 0.2593585348508894,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6243856332703214,
"calib/step_q_c_n": 529.0,
"calib/step_q_gap": 0.08670928472260364,
"calib/step_q_w": 0.5376763485477177,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 476.01171875,
"completions/mean_terminated_length": 481.6561584472656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.096,
"grad_norm": 0.005511470139026642,
"kl": 0.050212860107421875,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0152,
"num_tokens": 20855397.0,
"reward": 0.4338238537311554,
"reward_std": 0.20431192219257355,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7368851900100708,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.19267499446868896,
"step": 90
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6859926470588235,
"calib/avg_num_step_conf": 2.53515625,
"calib/ece": 0.22027999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.592,
"calib/gap": 0.10050735294117663,
"calib/mean_conf": 0.83172,
"calib/mu_c": 0.8638823529411765,
"calib/mu_w": 0.7633749999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18599999999999997,
"calib/std_conf": 0.2354787497843489,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6749728260869565,
"calib/step_q_c_n": 368.0,
"calib/step_q_gap": 0.14499738124709882,
"calib/step_q_w": 0.5299754448398577,
"calib/step_q_w_n": 281.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2463.0,
"completions/max_terminated_length": 2463.0,
"completions/mean_length": 464.1875,
"completions/mean_terminated_length": 466.00787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.09468050301074982,
"kl": 0.8864479064941406,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0433,
"num_tokens": 21081941.0,
"reward": 0.4435563385486603,
"reward_std": 0.20487895607948303,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7295480370521545,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.16977913677692413,
"step": 91
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7207602339181286,
"calib/avg_num_step_conf": 2.40234375,
"calib/ece": 0.21690196078431362,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6588235294117647,
"calib/gap": 0.15466165413533828,
"calib/mean_conf": 0.838,
"calib/mu_c": 0.8889473684210526,
"calib/mu_w": 0.7342857142857143,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1921568627450979,
"calib/std_conf": 0.2520790414478262,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7228042328042329,
"calib/step_q_c_n": 378.0,
"calib/step_q_gap": 0.1458000133949502,
"calib/step_q_w": 0.5770042194092827,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1088.0,
"completions/max_terminated_length": 1088.0,
"completions/mean_length": 417.71484375,
"completions/mean_terminated_length": 419.35296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.006382127292454243,
"kl": 0.05200958251953125,
"learning_rate": 3e-06,
"loss": -0.0076,
"num_tokens": 21295596.0,
"reward": 0.4563332200050354,
"reward_std": 0.20540866255760193,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7529066801071167,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.17305275797843933,
"step": 92
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6466410373362872,
"calib/avg_num_step_conf": 2.78515625,
"calib/ece": 0.26186507936507925,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5793650793650794,
"calib/gap": 0.1254623053365479,
"calib/mean_conf": 0.8167063492063492,
"calib/mu_c": 0.867986577181208,
"calib/mu_w": 0.7425242718446601,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.24365079365079353,
"calib/std_conf": 0.2537728507956622,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7077556818181818,
"calib/step_q_c_n": 352.0,
"calib/step_q_gap": 0.14222474922353723,
"calib/step_q_w": 0.5655309325946446,
"calib/step_q_w_n": 361.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2864.0,
"completions/max_terminated_length": 2864.0,
"completions/mean_length": 479.4375,
"completions/mean_terminated_length": 479.4375,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0992,
"grad_norm": 0.006033513229340315,
"kl": 0.053585052490234375,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0558,
"num_tokens": 21524108.0,
"reward": 0.40865015983581543,
"reward_std": 0.25605103373527527,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6888468265533447,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.18404650688171387,
"step": 93
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6965517241379311,
"calib/avg_num_step_conf": 2.5546875,
"calib/ece": 0.2849402390438247,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6095617529880478,
"calib/gap": 0.13012296681847768,
"calib/mean_conf": 0.8539442231075699,
"calib/mu_c": 0.9088965517241381,
"calib/mu_w": 0.7787735849056604,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.28059760956175295,
"calib/std_conf": 0.2095812103732773,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7510869565217392,
"calib/step_q_c_n": 322.0,
"calib/step_q_gap": 0.19261858302776314,
"calib/step_q_w": 0.558468373493976,
"calib/step_q_w_n": 332.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2944.0,
"completions/max_terminated_length": 2944.0,
"completions/mean_length": 431.9453125,
"completions/mean_terminated_length": 431.9453125,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.0064744469709694386,
"kl": 0.056850433349609375,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0457,
"num_tokens": 21743366.0,
"reward": 0.39161694049835205,
"reward_std": 0.24791404604911804,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6812667846679688,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.2058454155921936,
"step": 94
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6644457135605051,
"calib/avg_num_step_conf": 2.171875,
"calib/ece": 0.2740625000000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.63671875,
"calib/gap": 0.13748461196776463,
"calib/mean_conf": 0.8322656250000001,
"calib/mu_c": 0.8875816993464053,
"calib/mu_w": 0.7500970873786407,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2543359375000001,
"calib/std_conf": 0.24190879881343585,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7588580246913581,
"calib/step_q_c_n": 324.0,
"calib/step_q_gap": 0.18584078331204779,
"calib/step_q_w": 0.5730172413793103,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1533.0,
"completions/max_terminated_length": 1533.0,
"completions/mean_length": 427.07421875,
"completions/mean_terminated_length": 428.7490539550781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.00599477207288146,
"kl": 0.05338287353515625,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0132,
"num_tokens": 21958825.0,
"reward": 0.4048115015029907,
"reward_std": 0.2196621149778366,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7120952606201172,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.22200357913970947,
"step": 95
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7623239436619718,
"calib/avg_num_step_conf": 2.2734375,
"calib/ece": 0.1780392156862746,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7372549019607844,
"calib/gap": 0.16595606246172678,
"calib/mean_conf": 0.8996078431372551,
"calib/mu_c": 0.9458152173913044,
"calib/mu_w": 0.7798591549295776,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1780392156862746,
"calib/std_conf": 0.17185904831432297,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8053508951406649,
"calib/step_q_c_n": 391.0,
"calib/step_q_gap": 0.22629330351762833,
"calib/step_q_w": 0.5790575916230366,
"calib/step_q_w_n": 191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2568.0,
"completions/max_terminated_length": 2568.0,
"completions/mean_length": 389.19140625,
"completions/mean_terminated_length": 389.19140625,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1024,
"grad_norm": 0.006308667361736298,
"kl": 0.062023162841796875,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0673,
"num_tokens": 22164274.0,
"reward": 0.4969533085823059,
"reward_std": 0.19700340926647186,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.8014000654220581,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.15046215057373047,
"step": 96
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6387672505307854,
"calib/avg_num_step_conf": 2.46875,
"calib/ece": 0.24383399209486162,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6245059288537549,
"calib/gap": 0.12807523885350303,
"calib/mean_conf": 0.8454150197628459,
"calib/mu_c": 0.8940127388535031,
"calib/mu_w": 0.7659375,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23434782608695648,
"calib/std_conf": 0.22668152009606526,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7184895833333332,
"calib/step_q_c_n": 384.0,
"calib/step_q_gap": 0.09064079301075256,
"calib/step_q_w": 0.6278487903225807,
"calib/step_q_w_n": 248.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2265.0,
"completions/max_terminated_length": 2265.0,
"completions/mean_length": 417.40234375,
"completions/mean_terminated_length": 420.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.006150359287858009,
"kl": 0.05887603759765625,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0149,
"num_tokens": 22376201.0,
"reward": 0.40991461277008057,
"reward_std": 0.21251186728477478,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7105230093002319,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.21022510528564453,
"step": 97
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6510576923076923,
"calib/avg_num_step_conf": 2.1171875,
"calib/ece": 0.2783070866141732,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6732283464566929,
"calib/gap": 0.15268333333333317,
"calib/mean_conf": 0.8364173228346456,
"calib/mu_c": 0.8989333333333331,
"calib/mu_w": 0.74625,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2620866141732283,
"calib/std_conf": 0.25232025459923296,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7754846416382253,
"calib/step_q_c_n": 293.0,
"calib/step_q_gap": 0.17363725207999225,
"calib/step_q_w": 0.601847389558233,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1677.0,
"completions/max_terminated_length": 1677.0,
"completions/mean_length": 432.28125,
"completions/mean_terminated_length": 432.28125,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.00583996158093214,
"kl": 0.0562286376953125,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0185,
"num_tokens": 22593049.0,
"reward": 0.39500892162323,
"reward_std": 0.22828838229179382,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6983347535133362,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.22237937152385712,
"step": 98
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6685130992196208,
"calib/avg_num_step_conf": 2.5078125,
"calib/ece": 0.3617355371900826,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5206611570247934,
"calib/gap": 0.16863154960981064,
"calib/mean_conf": 0.757107438016529,
"calib/mu_c": 0.8532692307692309,
"calib/mu_w": 0.6846376811594203,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.34454545454545454,
"calib/std_conf": 0.2961388941956455,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6900972,
"calib/step_q_c_n": 250.0,
"calib/step_q_gap": 0.15017219999999998,
"calib/step_q_w": 0.539925,
"calib/step_q_w_n": 392.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 471.0703125,
"completions/mean_terminated_length": 490.219482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.1056,
"grad_norm": 0.006908495910465717,
"kl": 0.0541229248046875,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0288,
"num_tokens": 22819443.0,
"reward": 0.30328214168548584,
"reward_std": 0.2629527449607849,
"rewards/accuracy_reward_step": 0.40625,
"rewards/final_brier_reward_step": 0.6006648540496826,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": -0.2628505229949951,
"step": 99
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6835552426065804,
"calib/avg_num_step_conf": 2.71484375,
"calib/ece": 0.25205533596837937,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6047430830039525,
"calib/gap": 0.1741339137114326,
"calib/mean_conf": 0.8114229249011857,
"calib/mu_c": 0.8850684931506849,
"calib/mu_w": 0.7109345794392523,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2432015810276679,
"calib/std_conf": 0.25954115598178634,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6737652811735941,
"calib/step_q_c_n": 409.0,
"calib/step_q_gap": 0.09583171474002761,
"calib/step_q_w": 0.5779335664335665,
"calib/step_q_w_n": 286.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 455.66015625,
"completions/mean_terminated_length": 455.66015625,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.006140739191323519,
"kl": 0.05904388427734375,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0308,
"num_tokens": 23043500.0,
"reward": 0.400008887052536,
"reward_std": 0.22616682946681976,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7102363109588623,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.22193726897239685,
"step": 100
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7193985996499124,
"calib/avg_num_step_conf": 2.7734375,
"calib/ece": 0.33383399209486164,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6126482213438735,
"calib/gap": 0.16348462115528872,
"calib/mean_conf": 0.8145454545454545,
"calib/mu_c": 0.8979032258064515,
"calib/mu_w": 0.7344186046511628,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.32913043478260867,
"calib/std_conf": 0.2489287143960703,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6824866449511401,
"calib/step_q_c_n": 307.0,
"calib/step_q_gap": 0.12615910152682241,
"calib/step_q_w": 0.5563275434243177,
"calib/step_q_w_n": 403.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1582.0,
"completions/max_terminated_length": 1582.0,
"completions/mean_length": 467.17578125,
"completions/mean_terminated_length": 469.00787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.005982357542961836,
"kl": 0.059112548828125,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0133,
"num_tokens": 23270089.0,
"reward": 0.35126781463623047,
"reward_std": 0.25247329473495483,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6566468477249146,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.24786126613616943,
"step": 101
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7154881060704537,
"calib/avg_num_step_conf": 3.0078125,
"calib/ece": 0.2201176470588235,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5647058823529412,
"calib/gap": 0.18852398284154415,
"calib/mean_conf": 0.8247450980392158,
"calib/mu_c": 0.8971974522292993,
"calib/mu_w": 0.7086734693877551,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21458823529411764,
"calib/std_conf": 0.23791379581831032,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.715117994100295,
"calib/step_q_c_n": 452.0,
"calib/step_q_gap": 0.15436013246507474,
"calib/step_q_w": 0.5607578616352202,
"calib/step_q_w_n": 318.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1331.0,
"completions/max_terminated_length": 1331.0,
"completions/mean_length": 383.58984375,
"completions/mean_terminated_length": 385.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1088,
"grad_norm": 0.006579132750630379,
"kl": 0.07131195068359375,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.025,
"num_tokens": 23474984.0,
"reward": 0.43535923957824707,
"reward_std": 0.1856844127178192,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.749351978302002,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.2005085051059723,
"step": 102
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6476994372724263,
"calib/avg_num_step_conf": 2.49609375,
"calib/ece": 0.2566929133858268,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5551181102362205,
"calib/gap": 0.11292287322078787,
"calib/mean_conf": 0.8185826771653544,
"calib/mu_c": 0.8608176100628931,
"calib/mu_w": 0.7478947368421053,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22464566929133858,
"calib/std_conf": 0.2499644838061726,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7548011363636365,
"calib/step_q_c_n": 352.0,
"calib/step_q_gap": 0.20926106667722544,
"calib/step_q_w": 0.5455400696864111,
"calib/step_q_w_n": 287.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 466.2734375,
"completions/mean_terminated_length": 466.2734375,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.0054487548768520355,
"kl": 0.05999755859375,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0057,
"num_tokens": 23698902.0,
"reward": 0.4195878803730011,
"reward_std": 0.216641366481781,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7135539054870605,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.19703443348407745,
"step": 103
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6941060661990894,
"calib/avg_num_step_conf": 3.2734375,
"calib/ece": 0.2768627450980392,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.43529411764705883,
"calib/gap": 0.17975636766334446,
"calib/mean_conf": 0.751921568627451,
"calib/mu_c": 0.8428571428571429,
"calib/mu_w": 0.6631007751937984,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2673333333333333,
"calib/std_conf": 0.2707546348262433,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6844289044289045,
"calib/step_q_c_n": 429.0,
"calib/step_q_gap": 0.1231086110303714,
"calib/step_q_w": 0.5613202933985331,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1167.0,
"completions/max_terminated_length": 1167.0,
"completions/mean_length": 450.63671875,
"completions/mean_terminated_length": 452.4039611816406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.006285542622208595,
"kl": 0.06972503662109375,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0145,
"num_tokens": 23920945.0,
"reward": 0.38449519872665405,
"reward_std": 0.21741212904453278,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6973944902420044,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.22606037557125092,
"step": 104
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6722027972027972,
"calib/avg_num_step_conf": 3.4921875,
"calib/ece": 0.2310588235294118,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4627450980392157,
"calib/gap": 0.17593531468531465,
"calib/mean_conf": 0.7574117647058823,
"calib/mu_c": 0.8346853146853147,
"calib/mu_w": 0.6587500000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21384313725490198,
"calib/std_conf": 0.2732526789040839,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6315923566878981,
"calib/step_q_c_n": 471.0,
"calib/step_q_gap": 0.126320489075605,
"calib/step_q_w": 0.5052718676122931,
"calib/step_q_w_n": 423.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1407.0,
"completions/max_terminated_length": 1407.0,
"completions/mean_length": 467.0078125,
"completions/mean_terminated_length": 468.8392333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.112,
"grad_norm": 0.00584400026127696,
"kl": 0.0659332275390625,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.019,
"num_tokens": 24146259.0,
"reward": 0.413488507270813,
"reward_std": 0.2676308751106262,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.718665599822998,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.2010636329650879,
"step": 105
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7931652414486923,
"calib/avg_num_step_conf": 3.0078125,
"calib/ece": 0.23413385826771643,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5393700787401575,
"calib/gap": 0.2596755533199194,
"calib/mean_conf": 0.7857086614173228,
"calib/mu_c": 0.9002112676056336,
"calib/mu_w": 0.6405357142857142,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23039370078740148,
"calib/std_conf": 0.2775328769695504,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7003703703703702,
"calib/step_q_c_n": 378.0,
"calib/step_q_gap": 0.18276832955404376,
"calib/step_q_w": 0.5176020408163264,
"calib/step_q_w_n": 392.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2464.0,
"completions/max_terminated_length": 2464.0,
"completions/mean_length": 443.4296875,
"completions/mean_terminated_length": 445.16864013671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.005779189057648182,
"kl": 0.0670166015625,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0032,
"num_tokens": 24364361.0,
"reward": 0.4434106647968292,
"reward_std": 0.17971467971801758,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7472339868545532,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.1697876900434494,
"step": 106
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6255549121791161,
"calib/avg_num_step_conf": 3.34375,
"calib/ece": 0.2551953124999998,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.53515625,
"calib/gap": 0.07818374831113672,
"calib/mean_conf": 0.7985546875,
"calib/mu_c": 0.8287898089171973,
"calib/mu_w": 0.7506060606060606,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22023437499999984,
"calib/std_conf": 0.2446566889475482,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6505349794238683,
"calib/step_q_c_n": 486.0,
"calib/step_q_gap": 0.0916160605049493,
"calib/step_q_w": 0.558918918918919,
"calib/step_q_w_n": 370.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1247.0,
"completions/max_terminated_length": 1247.0,
"completions/mean_length": 430.19921875,
"completions/mean_terminated_length": 431.88629150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.0060838377103209496,
"kl": 0.075714111328125,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0091,
"num_tokens": 24579108.0,
"reward": 0.4143851101398468,
"reward_std": 0.19063109159469604,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7057347297668457,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.19962078332901,
"step": 107
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6504048904414099,
"calib/avg_num_step_conf": 3.1953125,
"calib/ece": 0.17356862745098045,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5686274509803921,
"calib/gap": 0.10877500793902806,
"calib/mean_conf": 0.8009411764705883,
"calib/mu_c": 0.8295212765957446,
"calib/mu_w": 0.7207462686567165,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11862745098039224,
"calib/std_conf": 0.2620984257394281,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6984483430799221,
"calib/step_q_c_n": 513.0,
"calib/step_q_gap": 0.1552352283258237,
"calib/step_q_w": 0.5432131147540984,
"calib/step_q_w_n": 305.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2590.0,
"completions/max_terminated_length": 2590.0,
"completions/mean_length": 453.625,
"completions/mean_terminated_length": 453.625,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.1152,
"grad_norm": 0.0059150392189621925,
"kl": 0.07221221923828125,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0307,
"num_tokens": 24798468.0,
"reward": 0.4753592908382416,
"reward_std": 0.20767410099506378,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7676601409912109,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.16147282719612122,
"step": 108
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7909703091521274,
"calib/avg_num_step_conf": 3.4453125,
"calib/ece": 0.26847656250000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.390625,
"calib/gap": 0.2549207223752681,
"calib/mean_conf": 0.7273046875,
"calib/mu_c": 0.8617355371900828,
"calib/mu_w": 0.6068148148148147,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26156250000000003,
"calib/std_conf": 0.27908360246622754,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6148711340206187,
"calib/step_q_c_n": 388.0,
"calib/step_q_gap": 0.13713834049835144,
"calib/step_q_w": 0.4777327935222672,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1371.0,
"completions/max_terminated_length": 1371.0,
"completions/mean_length": 456.94140625,
"completions/mean_terminated_length": 458.7333679199219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.005637164227664471,
"kl": 0.07299041748046875,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0214,
"num_tokens": 25020045.0,
"reward": 0.4240063428878784,
"reward_std": 0.17739006876945496,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.735093355178833,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.18161195516586304,
"step": 109
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6757722007722008,
"calib/avg_num_step_conf": 3.3984375,
"calib/ece": 0.17805533596837952,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.383399209486166,
"calib/gap": 0.18335971685971686,
"calib/mean_conf": 0.7340237154150197,
"calib/mu_c": 0.8101216216216216,
"calib/mu_w": 0.6267619047619047,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16354940711462457,
"calib/std_conf": 0.2678300140641943,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6310274261603375,
"calib/step_q_c_n": 474.0,
"calib/step_q_gap": 0.12784560797851918,
"calib/step_q_w": 0.5031818181818183,
"calib/step_q_w_n": 396.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 429.2734375,
"completions/mean_terminated_length": 430.9568786621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.0061676702462136745,
"kl": 0.07923126220703125,
"learning_rate": 2.5e-06,
"loss": 0.0147,
"num_tokens": 25234859.0,
"reward": 0.42947834730148315,
"reward_std": 0.21718741953372955,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7396499514579773,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.19319328665733337,
"step": 110
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7256516211061665,
"calib/avg_num_step_conf": 2.921875,
"calib/ece": 0.18086956521739125,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.43873517786561267,
"calib/gap": 0.2383286713286712,
"calib/mean_conf": 0.7358893280632411,
"calib/mu_c": 0.8395104895104893,
"calib/mu_w": 0.6011818181818182,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17577075098814227,
"calib/std_conf": 0.29037882372774526,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6409819121447029,
"calib/step_q_c_n": 387.0,
"calib/step_q_gap": 0.15715919746326246,
"calib/step_q_w": 0.48382271468144045,
"calib/step_q_w_n": 361.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1609.0,
"completions/max_terminated_length": 1609.0,
"completions/mean_length": 439.7578125,
"completions/mean_terminated_length": 441.4823913574219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1184,
"grad_norm": 0.006372205447405577,
"kl": 0.0758514404296875,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0532,
"num_tokens": 25454845.0,
"reward": 0.44394993782043457,
"reward_std": 0.1870567500591278,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7490593791007996,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.17053453624248505,
"step": 111
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7689470365699873,
"calib/avg_num_step_conf": 2.81640625,
"calib/ece": 0.16642857142857145,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.32142857142857145,
"calib/gap": 0.2834804539722572,
"calib/mean_conf": 0.6296825396825397,
"calib/mu_c": 0.7669230769230769,
"calib/mu_w": 0.48344262295081974,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14011904761904764,
"calib/std_conf": 0.31763745058257475,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6238775510204082,
"calib/step_q_c_n": 294.0,
"calib/step_q_gap": 0.224135162261626,
"calib/step_q_w": 0.3997423887587822,
"calib/step_q_w_n": 427.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 480.234375,
"completions/mean_terminated_length": 480.234375,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.006004595663398504,
"kl": 0.0638885498046875,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0478,
"num_tokens": 25685705.0,
"reward": 0.44127148389816284,
"reward_std": 0.2001304030418396,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7588914036750793,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.17322342097759247,
"step": 112
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8027427531522163,
"calib/avg_num_step_conf": 2.78515625,
"calib/ece": 0.1334117647058823,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4235294117647059,
"calib/gap": 0.3040881320681138,
"calib/mean_conf": 0.7180392156862746,
"calib/mu_c": 0.834904458598726,
"calib/mu_w": 0.5308163265306122,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11788235294117644,
"calib/std_conf": 0.29488861046126724,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.60359375,
"calib/step_q_c_n": 384.0,
"calib/step_q_gap": 0.18702840045592717,
"calib/step_q_w": 0.41656534954407287,
"calib/step_q_w_n": 329.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 398.43359375,
"completions/mean_terminated_length": 398.43359375,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.006771065294742584,
"kl": 0.08171844482421875,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0509,
"num_tokens": 25892904.0,
"reward": 0.49621909856796265,
"reward_std": 0.1860068142414093,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.8066890239715576,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.1361258625984192,
"step": 113
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8219507242452957,
"calib/avg_num_step_conf": 2.79296875,
"calib/ece": 0.1100392156862745,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.403921568627451,
"calib/gap": 0.3354501150670095,
"calib/mean_conf": 0.7237647058823531,
"calib/mu_c": 0.8408433734939758,
"calib/mu_w": 0.5053932584269663,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09141176470588235,
"calib/std_conf": 0.2902006464929282,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6508450704225351,
"calib/step_q_c_n": 426.0,
"calib/step_q_gap": 0.2275924752668258,
"calib/step_q_w": 0.42325259515570934,
"calib/step_q_w_n": 289.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 414.73046875,
"completions/mean_terminated_length": 414.73046875,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.1216,
"grad_norm": 0.006451824214309454,
"kl": 0.068511962890625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.006,
"num_tokens": 26104099.0,
"reward": 0.5097230076789856,
"reward_std": 0.17535501718521118,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.8283358812332153,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.1362336277961731,
"step": 114
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6537506367804381,
"calib/avg_num_step_conf": 2.5625,
"calib/ece": 0.195593725490196,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.39215686274509803,
"calib/gap": 0.16332058074375955,
"calib/mean_conf": 0.6910729411764706,
"calib/mu_c": 0.757682119205298,
"calib/mu_w": 0.5943615384615385,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14725490196078425,
"calib/std_conf": 0.30040811066629775,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5597662337662338,
"calib/step_q_c_n": 385.0,
"calib/step_q_gap": 0.08283449944889049,
"calib/step_q_w": 0.4769317343173433,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1153.0,
"completions/max_terminated_length": 1153.0,
"completions/mean_length": 382.875,
"completions/mean_terminated_length": 384.3764953613281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.006533203646540642,
"kl": 0.0797576904296875,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0256,
"num_tokens": 26307379.0,
"reward": 0.42707669734954834,
"reward_std": 0.19787093997001648,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7344694137573242,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.1975034773349762,
"step": 115
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7142095022908429,
"calib/avg_num_step_conf": 2.83203125,
"calib/ece": 0.21039370078740158,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3779527559055118,
"calib/gap": 0.17867947028180498,
"calib/mean_conf": 0.7144094488188977,
"calib/mu_c": 0.7939007092198581,
"calib/mu_w": 0.6152212389380531,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18484251968503934,
"calib/std_conf": 0.28248518473675605,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6328875379939211,
"calib/step_q_c_n": 329.0,
"calib/step_q_gap": 0.184056729913113,
"calib/step_q_w": 0.44883080808080805,
"calib/step_q_w_n": 396.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1376.0,
"completions/max_terminated_length": 1376.0,
"completions/mean_length": 446.05859375,
"completions/mean_terminated_length": 447.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.006134877912700176,
"kl": 0.06531524658203125,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.019,
"num_tokens": 26526090.0,
"reward": 0.4186154007911682,
"reward_std": 0.2001335471868515,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.730369508266449,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.20173242688179016,
"step": 116
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7597947761194029,
"calib/avg_num_step_conf": 2.48828125,
"calib/ece": 0.2484645669291338,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.32677165354330706,
"calib/gap": 0.23092910447761184,
"calib/mean_conf": 0.7149212598425199,
"calib/mu_c": 0.83675,
"calib/mu_w": 0.6058208955223882,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24547244094488183,
"calib/std_conf": 0.2675887403413453,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6150537634408603,
"calib/step_q_c_n": 279.0,
"calib/step_q_gap": 0.11784705953024577,
"calib/step_q_w": 0.4972067039106145,
"calib/step_q_w_n": 358.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2332.0,
"completions/max_terminated_length": 2332.0,
"completions/mean_length": 427.9453125,
"completions/mean_terminated_length": 427.9453125,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1248,
"grad_norm": 0.005825493484735489,
"kl": 0.06906890869140625,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0049,
"num_tokens": 26742244.0,
"reward": 0.40940552949905396,
"reward_std": 0.20684045553207397,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7297269701957703,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.20310339331626892,
"step": 117
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.714480182347727,
"calib/avg_num_step_conf": 3.05859375,
"calib/ece": 0.14988235294117655,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3843137254901961,
"calib/gap": 0.21608079017348358,
"calib/mean_conf": 0.7328627450980392,
"calib/mu_c": 0.8226845637583894,
"calib/mu_w": 0.6066037735849058,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14921568627450987,
"calib/std_conf": 0.27010971043428966,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5166139954853274,
"calib/step_q_c_n": 443.0,
"calib/step_q_gap": 0.05520223077944503,
"calib/step_q_w": 0.46141176470588235,
"calib/step_q_w_n": 340.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1836.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 437.41796875,
"completions/mean_terminated_length": 439.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.005717393942177296,
"kl": 0.069976806640625,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0478,
"num_tokens": 26958231.0,
"reward": 0.43301814794540405,
"reward_std": 0.18237316608428955,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7607734203338623,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.20958086848258972,
"step": 118
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7422657952069718,
"calib/avg_num_step_conf": 2.39453125,
"calib/ece": 0.2096456692913386,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": 0.24138748832866486,
"calib/mean_conf": 0.708464566929134,
"calib/mu_c": 0.8215555555555556,
"calib/mu_w": 0.5801680672268907,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19330708661417328,
"calib/std_conf": 0.30811658738481257,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6629113924050634,
"calib/step_q_c_n": 237.0,
"calib/step_q_gap": 0.2757039455965527,
"calib/step_q_w": 0.38720744680851066,
"calib/step_q_w_n": 376.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 458.72265625,
"completions/mean_terminated_length": 460.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.005962693598121405,
"kl": 0.0665283203125,
"learning_rate": 2.25e-06,
"loss": 0.0114,
"num_tokens": 27180728.0,
"reward": 0.43463796377182007,
"reward_std": 0.2322581708431244,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7391331791877747,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.17376351356506348,
"step": 119
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7753120224146716,
"calib/avg_num_step_conf": 2.40625,
"calib/ece": 0.15745098039215694,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.43529411764705883,
"calib/gap": 0.2925458481915436,
"calib/mean_conf": 0.7361176470588237,
"calib/mu_c": 0.855430463576159,
"calib/mu_w": 0.5628846153846154,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15070588235294125,
"calib/std_conf": 0.3018352782210479,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6148901098901098,
"calib/step_q_c_n": 364.0,
"calib/step_q_gap": 0.12766788766788761,
"calib/step_q_w": 0.4872222222222222,
"calib/step_q_w_n": 252.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2082.0,
"completions/max_terminated_length": 2082.0,
"completions/mean_length": 404.77734375,
"completions/mean_terminated_length": 404.77734375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.128,
"grad_norm": 0.006345272064208984,
"kl": 0.071075439453125,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0323,
"num_tokens": 27391039.0,
"reward": 0.47319233417510986,
"reward_std": 0.17949917912483215,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7807586193084717,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.14999887347221375,
"step": 120
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6862980769230769,
"calib/avg_num_step_conf": 2.265625,
"calib/ece": 0.22480468749999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.1558653846153848,
"calib/mean_conf": 0.7729296875,
"calib/mu_c": 0.83625,
"calib/mu_w": 0.6803846153846153,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20199218749999995,
"calib/std_conf": 0.2713587248572493,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6637457044673539,
"calib/step_q_c_n": 291.0,
"calib/step_q_gap": 0.1868252892424404,
"calib/step_q_w": 0.4769204152249135,
"calib/step_q_w_n": 289.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1338.0,
"completions/max_terminated_length": 1338.0,
"completions/mean_length": 426.74609375,
"completions/mean_terminated_length": 428.4196472167969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.006708329077810049,
"kl": 0.067962646484375,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0081,
"num_tokens": 27605342.0,
"reward": 0.42762500047683716,
"reward_std": 0.23545682430267334,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7204293012619019,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.182366743683815,
"step": 121
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7024892759651632,
"calib/avg_num_step_conf": 2.265625,
"calib/ece": 0.18407843137254898,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5764705882352941,
"calib/gap": 0.21602495775380204,
"calib/mean_conf": 0.7934117647058825,
"calib/mu_c": 0.8764331210191082,
"calib/mu_w": 0.6604081632653062,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18090196078431367,
"calib/std_conf": 0.275026784896692,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6653571428571429,
"calib/step_q_c_n": 308.0,
"calib/step_q_gap": 0.2497321428571429,
"calib/step_q_w": 0.41562499999999997,
"calib/step_q_w_n": 272.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2160.0,
"completions/max_terminated_length": 2160.0,
"completions/mean_length": 409.15234375,
"completions/mean_terminated_length": 409.15234375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.0067081572487950325,
"kl": 0.08014678955078125,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0015,
"num_tokens": 27817429.0,
"reward": 0.4447178244590759,
"reward_std": 0.26026856899261475,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.748012900352478,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.17888976633548737,
"step": 122
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6532738095238096,
"calib/avg_num_step_conf": 2.5546875,
"calib/ece": 0.25507874015748033,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": 0.16622519841269845,
"calib/mean_conf": 0.7196456692913387,
"calib/mu_c": 0.8034126984126985,
"calib/mu_w": 0.6371875,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2393307086614173,
"calib/std_conf": 0.29290607528342233,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5598722044728435,
"calib/step_q_c_n": 313.0,
"calib/step_q_gap": 0.11740886136433909,
"calib/step_q_w": 0.4424633431085044,
"calib/step_q_w_n": 341.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 486.5703125,
"completions/mean_terminated_length": 488.47845458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.1312,
"grad_norm": 0.00602144468575716,
"kl": 0.06238555908203125,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0127,
"num_tokens": 28047279.0,
"reward": 0.38009756803512573,
"reward_std": 0.2720668613910675,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6759315729141235,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.2079240083694458,
"step": 123
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7034875966924513,
"calib/avg_num_step_conf": 2.30859375,
"calib/ece": 0.18203921568627454,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4980392156862745,
"calib/gap": 0.17056881835156024,
"calib/mean_conf": 0.7839215686274511,
"calib/mu_c": 0.8454601226993863,
"calib/mu_w": 0.6748913043478261,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16337254901960788,
"calib/std_conf": 0.2511673017260864,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6243681318681319,
"calib/step_q_c_n": 364.0,
"calib/step_q_gap": 0.09388355037033458,
"calib/step_q_w": 0.5304845814977973,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1061.0,
"completions/max_terminated_length": 1061.0,
"completions/mean_length": 403.75390625,
"completions/mean_terminated_length": 405.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.006141428370028734,
"kl": 0.0720672607421875,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0239,
"num_tokens": 28257456.0,
"reward": 0.44975244998931885,
"reward_std": 0.19885006546974182,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7609667778015137,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.1880244016647339,
"step": 124
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6631318818818819,
"calib/avg_num_step_conf": 1.9375,
"calib/ece": 0.21301960784313734,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.47058823529411764,
"calib/gap": 0.18230480480480493,
"calib/mean_conf": 0.7492549019607844,
"calib/mu_c": 0.8286111111111112,
"calib/mu_w": 0.6463063063063063,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19878431372549027,
"calib/std_conf": 0.2917127141110068,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6556554307116105,
"calib/step_q_c_n": 267.0,
"calib/step_q_gap": 0.09836285429239644,
"calib/step_q_w": 0.557292576419214,
"calib/step_q_w_n": 229.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1693.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 439.15234375,
"completions/mean_terminated_length": 440.8745422363281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.006332618184387684,
"kl": 0.06296539306640625,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0098,
"num_tokens": 28474687.0,
"reward": 0.4046638011932373,
"reward_std": 0.2163892388343811,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7109558582305908,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.21100321412086487,
"step": 125
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7924464417631127,
"calib/avg_num_step_conf": 2.453125,
"calib/ece": 0.20345098039215695,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5098039215686274,
"calib/gap": 0.35185360748584105,
"calib/mean_conf": 0.7103529411764706,
"calib/mu_c": 0.8814503816793895,
"calib/mu_w": 0.5295967741935484,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20003921568627459,
"calib/std_conf": 0.33382905238585764,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6417880794701988,
"calib/step_q_c_n": 302.0,
"calib/step_q_gap": 0.23789237394872637,
"calib/step_q_w": 0.4038957055214724,
"calib/step_q_w_n": 326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2490.0,
"completions/max_terminated_length": 2490.0,
"completions/mean_length": 437.0,
"completions/mean_terminated_length": 437.0,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.1344,
"grad_norm": 0.005750678479671478,
"kl": 0.0669403076171875,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0252,
"num_tokens": 28692023.0,
"reward": 0.4464186429977417,
"reward_std": 0.19056807458400726,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7728476524353027,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.1815728098154068,
"step": 126
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7651209677419355,
"calib/avg_num_step_conf": 1.98046875,
"calib/ece": 0.250952380952381,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49603174603174605,
"calib/gap": 0.291938004032258,
"calib/mean_conf": 0.7237301587301587,
"calib/mu_c": 0.872016129032258,
"calib/mu_w": 0.580078125,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.24130952380952386,
"calib/std_conf": 0.3258486700250864,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6589912280701754,
"calib/step_q_c_n": 228.0,
"calib/step_q_gap": 0.18368656857196758,
"calib/step_q_w": 0.4753046594982078,
"calib/step_q_w_n": 279.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1938.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 395.6875,
"completions/mean_terminated_length": 397.2392272949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.006842640228569508,
"kl": 0.076812744140625,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0037,
"num_tokens": 28896991.0,
"reward": 0.40660762786865234,
"reward_std": 0.22128602862358093,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7246460914611816,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.20596206188201904,
"step": 127
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6736004108885464,
"calib/avg_num_step_conf": 1.91015625,
"calib/ece": 0.23653999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.44,
"calib/gap": 0.2019042116076012,
"calib/mean_conf": 0.70758,
"calib/mu_c": 0.8028787878787879,
"calib/mu_w": 0.6009745762711867,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.20806,
"calib/std_conf": 0.31591366478834054,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6228517110266161,
"calib/step_q_c_n": 263.0,
"calib/step_q_gap": 0.14678976412396116,
"calib/step_q_w": 0.4760619469026549,
"calib/step_q_w_n": 226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2831.0,
"completions/max_terminated_length": 2831.0,
"completions/mean_length": 444.1015625,
"completions/mean_terminated_length": 447.5984191894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.007062236778438091,
"kl": 0.06587600708007812,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0319,
"num_tokens": 29117345.0,
"reward": 0.39006370306015015,
"reward_std": 0.22584298253059387,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6938701868057251,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.20983658730983734,
"step": 128
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6701629395078832,
"calib/avg_num_step_conf": 1.9453125,
"calib/ece": 0.197421875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.5625,
"calib/gap": 0.14036941750775134,
"calib/mean_conf": 0.8085156250000001,
"calib/mu_c": 0.8595092024539879,
"calib/mu_w": 0.7191397849462365,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18460937500000002,
"calib/std_conf": 0.24441125000879024,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.692340425531915,
"calib/step_q_c_n": 329.0,
"calib/step_q_gap": 0.048612614881027416,
"calib/step_q_w": 0.6437278106508876,
"calib/step_q_w_n": 169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1371.0,
"completions/max_terminated_length": 1371.0,
"completions/mean_length": 362.53515625,
"completions/mean_terminated_length": 363.9568786621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.1376,
"grad_norm": 0.006954402197152376,
"kl": 0.073272705078125,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.0052,
"num_tokens": 29312538.0,
"reward": 0.43850159645080566,
"reward_std": 0.20004130899906158,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7443780899047852,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.19471865892410278,
"step": 129
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7535458642629905,
"calib/avg_num_step_conf": 1.90234375,
"calib/ece": 0.19234374999999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.59765625,
"calib/gap": 0.25480381760339355,
"calib/mean_conf": 0.803125,
"calib/mu_c": 0.8946951219512196,
"calib/mu_w": 0.639891304347826,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17742187499999995,
"calib/std_conf": 0.2773232826954852,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7463197026022306,
"calib/step_q_c_n": 269.0,
"calib/step_q_gap": 0.22274172095085432,
"calib/step_q_w": 0.5235779816513763,
"calib/step_q_w_n": 218.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1276.0,
"completions/max_terminated_length": 1276.0,
"completions/mean_length": 374.10546875,
"completions/mean_terminated_length": 375.57257080078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.006909600459039211,
"kl": 0.0675201416015625,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0234,
"num_tokens": 29513597.0,
"reward": 0.47459420561790466,
"reward_std": 0.1551002562046051,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7759792804718018,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.15335342288017273,
"step": 130
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7306662515566624,
"calib/avg_num_step_conf": 2.4140625,
"calib/ece": 0.28429687499999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.3828125,
"calib/gap": 0.23855541718555395,
"calib/mean_conf": 0.7003125,
"calib/mu_c": 0.8363636363636362,
"calib/mu_w": 0.5978082191780822,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27746093749999995,
"calib/std_conf": 0.29775970151071485,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6309505703422054,
"calib/step_q_c_n": 263.0,
"calib/step_q_gap": 0.18106324639854338,
"calib/step_q_w": 0.449887323943662,
"calib/step_q_w_n": 355.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 925.0,
"completions/max_terminated_length": 925.0,
"completions/mean_length": 368.26953125,
"completions/mean_terminated_length": 369.7137451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.006882078945636749,
"kl": 0.076446533203125,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0246,
"num_tokens": 29714082.0,
"reward": 0.38815802335739136,
"reward_std": 0.21172058582305908,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.7099640369415283,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.21958552300930023,
"step": 131
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7970082577501015,
"calib/avg_num_step_conf": 2.359375,
"calib/ece": 0.1489372549019608,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5450980392156862,
"calib/gap": 0.30514627047515885,
"calib/mean_conf": 0.773250980392157,
"calib/mu_c": 0.8797530120481927,
"calib/mu_w": 0.5746067415730338,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1356039215686275,
"calib/std_conf": 0.2877999437058285,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7027909090909091,
"calib/step_q_c_n": 330.0,
"calib/step_q_gap": 0.2830098871930989,
"calib/step_q_w": 0.41978102189781025,
"calib/step_q_w_n": 274.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1128.0,
"completions/max_terminated_length": 1128.0,
"completions/mean_length": 421.4296875,
"completions/mean_terminated_length": 423.0823669433594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.1408,
"grad_norm": 0.006782933603972197,
"kl": 0.067291259765625,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0141,
"num_tokens": 29927560.0,
"reward": 0.4930591583251953,
"reward_std": 0.19035959243774414,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7980226278305054,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.1369042992591858,
"step": 132
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7284993144708961,
"calib/avg_num_step_conf": 2.109375,
"calib/ece": 0.24494117647058822,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.34901960784313724,
"calib/gap": 0.24423656986164777,
"calib/mean_conf": 0.6661176470588236,
"calib/mu_c": 0.8021238938053097,
"calib/mu_w": 0.5578873239436619,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23396078431372547,
"calib/std_conf": 0.3196263533675105,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6283255813953488,
"calib/step_q_c_n": 215.0,
"calib/step_q_gap": 0.18069481216457967,
"calib/step_q_w": 0.4476307692307691,
"calib/step_q_w_n": 325.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 488.4453125,
"completions/mean_terminated_length": 488.4453125,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.005817534402012825,
"kl": 0.053134918212890625,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.003,
"num_tokens": 30158946.0,
"reward": 0.39914774894714355,
"reward_std": 0.23423807322978973,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7104480266571045,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.19730885326862335,
"step": 133
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7303877940241577,
"calib/avg_num_step_conf": 1.8125,
"calib/ece": 0.18675889328063244,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.47035573122529645,
"calib/gap": 0.24748251748251726,
"calib/mean_conf": 0.7198814229249013,
"calib/mu_c": 0.8274825174825173,
"calib/mu_w": 0.5800000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1707114624505929,
"calib/std_conf": 0.31804054507408136,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6810256410256411,
"calib/step_q_c_n": 234.0,
"calib/step_q_gap": 0.18351564102564116,
"calib/step_q_w": 0.49750999999999995,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 491.453125,
"completions/mean_terminated_length": 493.38043212890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.005839070305228233,
"kl": 0.05161285400390625,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0228,
"num_tokens": 30393710.0,
"reward": 0.4265187382698059,
"reward_std": 0.2271723449230194,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7398152351379395,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.19537153840065002,
"step": 134
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6970262535772054,
"calib/avg_num_step_conf": 2.1328125,
"calib/ece": 0.23145098039215684,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4549019607843137,
"calib/gap": 0.19408361328854062,
"calib/mean_conf": 0.707843137254902,
"calib/mu_c": 0.7946099290780142,
"calib/mu_w": 0.6005263157894736,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19317647058823526,
"calib/std_conf": 0.3222598739751373,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6618283582089552,
"calib/step_q_c_n": 268.0,
"calib/step_q_gap": 0.18118087619456674,
"calib/step_q_w": 0.4806474820143885,
"calib/step_q_w_n": 278.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2213.0,
"completions/max_terminated_length": 2213.0,
"completions/mean_length": 442.09375,
"completions/mean_terminated_length": 442.09375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.144,
"grad_norm": 0.006755627226084471,
"kl": 0.06070709228515625,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0245,
"num_tokens": 30612766.0,
"reward": 0.4173700213432312,
"reward_std": 0.20016345381736755,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7141886949539185,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.18804235756397247,
"step": 135
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8194513099357389,
"calib/avg_num_step_conf": 2.6796875,
"calib/ece": 0.21752941176470592,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.43529411764705883,
"calib/gap": 0.36043067226890735,
"calib/mean_conf": 0.6706274509803922,
"calib/mu_c": 0.8628571428571427,
"calib/mu_w": 0.5024264705882353,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21074509803921573,
"calib/std_conf": 0.33725593707091495,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6736032388663967,
"calib/step_q_c_n": 247.0,
"calib/step_q_gap": 0.30733900196434655,
"calib/step_q_w": 0.3662642369020502,
"calib/step_q_w_n": 439.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1385.0,
"completions/max_terminated_length": 1385.0,
"completions/mean_length": 412.2421875,
"completions/mean_terminated_length": 413.8588562011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.007109965663403273,
"kl": 0.06912994384765625,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0012,
"num_tokens": 30826788.0,
"reward": 0.4410070478916168,
"reward_std": 0.16643013060092926,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7721558809280396,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.1823292225599289,
"step": 136
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7205128205128205,
"calib/avg_num_step_conf": 2.44921875,
"calib/ece": 0.2232539682539683,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5357142857142857,
"calib/gap": 0.2512706552706552,
"calib/mean_conf": 0.7311904761904762,
"calib/mu_c": 0.8478518518518519,
"calib/mu_w": 0.5965811965811967,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2093650793650794,
"calib/std_conf": 0.32522545816626314,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6836758893280633,
"calib/step_q_c_n": 253.0,
"calib/step_q_gap": 0.3525528946756569,
"calib/step_q_w": 0.3311229946524064,
"calib/step_q_w_n": 374.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 441.5546875,
"completions/mean_terminated_length": 443.28631591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.006158465053886175,
"kl": 0.06252288818359375,
"learning_rate": 1.75e-06,
"loss": -0.0077,
"num_tokens": 31046810.0,
"reward": 0.4185337424278259,
"reward_std": 0.1904926896095276,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7140519618988037,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.17776569724082947,
"step": 137
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6729764138300725,
"calib/avg_num_step_conf": 2.1953125,
"calib/ece": 0.18301960784313717,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.2131687215223802,
"calib/mean_conf": 0.7287450980392157,
"calib/mu_c": 0.8048170731707318,
"calib/mu_w": 0.5916483516483516,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13431372549019602,
"calib/std_conf": 0.31718491193359705,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.601949860724234,
"calib/step_q_c_n": 359.0,
"calib/step_q_gap": 0.12815675727595804,
"calib/step_q_w": 0.4737931034482759,
"calib/step_q_w_n": 203.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1935.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 430.37890625,
"completions/mean_terminated_length": 430.37890625,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.1472,
"grad_norm": 0.0068092262372374535,
"kl": 0.06418609619140625,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0295,
"num_tokens": 31261323.0,
"reward": 0.4481315016746521,
"reward_std": 0.227791890501976,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.753614068031311,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.18391355872154236,
"step": 138
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7020090821521948,
"calib/avg_num_step_conf": 2.0625,
"calib/ece": 0.18509803921568624,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5176470588235295,
"calib/gap": 0.20241021054080055,
"calib/mean_conf": 0.7465882352941179,
"calib/mu_c": 0.8148520710059171,
"calib/mu_w": 0.6124418604651165,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13447058823529406,
"calib/std_conf": 0.3077465965770035,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6642196531791908,
"calib/step_q_c_n": 346.0,
"calib/step_q_gap": 0.15026360922314685,
"calib/step_q_w": 0.513956043956044,
"calib/step_q_w_n": 182.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1319.0,
"completions/max_terminated_length": 1319.0,
"completions/mean_length": 381.12109375,
"completions/mean_terminated_length": 382.61572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.007338706869632006,
"kl": 0.07012939453125,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0152,
"num_tokens": 31461986.0,
"reward": 0.46961668133735657,
"reward_std": 0.17519153654575348,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7622421979904175,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.15504010021686554,
"step": 139
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7917524257571302,
"calib/avg_num_step_conf": 2.23828125,
"calib/ece": 0.10454901960784307,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.31926712731549545,
"calib/mean_conf": 0.7508235294117648,
"calib/mu_c": 0.845977653631285,
"calib/mu_w": 0.5267105263157895,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07670588235294112,
"calib/std_conf": 0.3000112877538112,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.661927680798005,
"calib/step_q_c_n": 401.0,
"calib/step_q_gap": 0.22181140172823754,
"calib/step_q_w": 0.44011627906976747,
"calib/step_q_w_n": 172.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 941.0,
"completions/max_terminated_length": 941.0,
"completions/mean_length": 421.5234375,
"completions/mean_terminated_length": 423.1764831542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.0073651643469929695,
"kl": 0.06573486328125,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0359,
"num_tokens": 31674912.0,
"reward": 0.5161871910095215,
"reward_std": 0.1584603190422058,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.8248147964477539,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.12994034588336945,
"step": 140
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7992788461538464,
"calib/avg_num_step_conf": 2.08984375,
"calib/ece": 0.11972440944881882,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5826771653543307,
"calib/gap": 0.3176442307692309,
"calib/mean_conf": 0.7793307086614174,
"calib/mu_c": 0.8768750000000001,
"calib/mu_w": 0.5592307692307692,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1030708661417322,
"calib/std_conf": 0.30069071548654897,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6873584905660377,
"calib/step_q_c_n": 371.0,
"calib/step_q_gap": 0.20705361251725718,
"calib/step_q_w": 0.4803048780487805,
"calib/step_q_w_n": 164.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2909.0,
"completions/max_terminated_length": 2909.0,
"completions/mean_length": 449.04296875,
"completions/mean_terminated_length": 449.04296875,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.1504,
"grad_norm": 0.006539446301758289,
"kl": 0.061206817626953125,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0566,
"num_tokens": 31896963.0,
"reward": 0.5025902390480042,
"reward_std": 0.19069364666938782,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.814516007900238,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.1444917619228363,
"step": 141
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.77997227997228,
"calib/avg_num_step_conf": 2.16015625,
"calib/ece": 0.17862204724409453,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.3354671454671455,
"calib/mean_conf": 0.7025590551181102,
"calib/mu_c": 0.8491608391608392,
"calib/mu_w": 0.5136936936936937,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15909448818897642,
"calib/std_conf": 0.3424531499319575,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6738333333333334,
"calib/step_q_c_n": 300.0,
"calib/step_q_gap": 0.25288471673254287,
"calib/step_q_w": 0.4209486166007905,
"calib/step_q_w_n": 253.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 469.6484375,
"completions/mean_terminated_length": 469.6484375,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.006188663654029369,
"kl": 0.061065673828125,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0125,
"num_tokens": 32122353.0,
"reward": 0.4513563811779022,
"reward_std": 0.18066132068634033,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7761745452880859,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.18361811339855194,
"step": 142
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7944169232829027,
"calib/avg_num_step_conf": 2.8671875,
"calib/ece": 0.12199203187250987,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.398406374501992,
"calib/gap": 0.30510242334984594,
"calib/mean_conf": 0.6853386454183269,
"calib/mu_c": 0.8032467532467532,
"calib/mu_w": 0.49814432989690727,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.09689243027888436,
"calib/std_conf": 0.30802316759208864,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5960880829015545,
"calib/step_q_c_n": 386.0,
"calib/step_q_gap": 0.2187604966946579,
"calib/step_q_w": 0.37732758620689655,
"calib/step_q_w_n": 348.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 487.60546875,
"completions/mean_terminated_length": 493.3873596191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.006679258309304714,
"kl": 0.0638275146484375,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0334,
"num_tokens": 32354516.0,
"reward": 0.4849836230278015,
"reward_std": 0.16762301325798035,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.787904679775238,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.13356241583824158,
"step": 143
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7010775862068965,
"calib/avg_num_step_conf": 2.0703125,
"calib/ece": 0.16389370078740156,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5393700787401575,
"calib/gap": 0.2521537356321839,
"calib/mean_conf": 0.7216102362204725,
"calib/mu_c": 0.8010287356321839,
"calib/mu_w": 0.548875,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10023228346456695,
"calib/std_conf": 0.3310463155584519,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6510911854103344,
"calib/step_q_c_n": 329.0,
"calib/step_q_gap": 0.16780760332078215,
"calib/step_q_w": 0.48328358208955224,
"calib/step_q_w_n": 201.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2259.0,
"completions/max_terminated_length": 2259.0,
"completions/mean_length": 442.734375,
"completions/mean_terminated_length": 442.734375,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.1536,
"grad_norm": 0.007385350298136473,
"kl": 0.06850814819335938,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0029,
"num_tokens": 32571984.0,
"reward": 0.4696730971336365,
"reward_std": 0.19212749600410461,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7683699131011963,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.16261744499206543,
"step": 144
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6778014688462449,
"calib/avg_num_step_conf": 2.3671875,
"calib/ece": 0.15023437500000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.46484375,
"calib/gap": 0.2027323698965492,
"calib/mean_conf": 0.739375,
"calib/mu_c": 0.7924338624338626,
"calib/mu_w": 0.5897014925373134,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07566406250000002,
"calib/std_conf": 0.29674865732636435,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.638768472906404,
"calib/step_q_c_n": 406.0,
"calib/step_q_gap": 0.1725684729064041,
"calib/step_q_w": 0.46619999999999995,
"calib/step_q_w_n": 200.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1073.0,
"completions/max_terminated_length": 1073.0,
"completions/mean_length": 393.6796875,
"completions/mean_terminated_length": 395.2235412597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.007237947080284357,
"kl": 0.06774139404296875,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0243,
"num_tokens": 32775470.0,
"reward": 0.5008053779602051,
"reward_std": 0.16850511729717255,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7931559085845947,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.13842013478279114,
"step": 145
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7339015151515151,
"calib/avg_num_step_conf": 2.37890625,
"calib/ece": 0.24218750000000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4921875,
"calib/gap": 0.2860361681329424,
"calib/mean_conf": 0.7196093750000001,
"calib/mu_c": 0.8670967741935485,
"calib/mu_w": 0.581060606060606,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23871093750000005,
"calib/std_conf": 0.3211257112286548,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7208627450980393,
"calib/step_q_c_n": 255.0,
"calib/step_q_gap": 0.29100398803589234,
"calib/step_q_w": 0.42985875706214693,
"calib/step_q_w_n": 354.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1481.0,
"completions/max_terminated_length": 1481.0,
"completions/mean_length": 444.40234375,
"completions/mean_terminated_length": 446.1451110839844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.007386982906609774,
"kl": 0.0638885498046875,
"learning_rate": 1.5e-06,
"loss": 0.0116,
"num_tokens": 32996453.0,
"reward": 0.40284696221351624,
"reward_std": 0.22834259271621704,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.719482421875,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.20753851532936096,
"step": 146
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.755403548225887,
"calib/avg_num_step_conf": 2.3203125,
"calib/ece": 0.22779527559055124,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5078740157480315,
"calib/gap": 0.25840329835082454,
"calib/mean_conf": 0.6967716535433072,
"calib/mu_c": 0.8147826086956522,
"calib/mu_w": 0.5563793103448277,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1906299212598426,
"calib/std_conf": 0.3387782167456465,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6112996389891696,
"calib/step_q_c_n": 277.0,
"calib/step_q_gap": 0.13502203646551025,
"calib/step_q_w": 0.47627760252365936,
"calib/step_q_w_n": 317.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2760.0,
"completions/max_terminated_length": 2760.0,
"completions/mean_length": 452.94921875,
"completions/mean_terminated_length": 454.72552490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.1568,
"grad_norm": 0.006861098576337099,
"kl": 0.063690185546875,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0086,
"num_tokens": 33216088.0,
"reward": 0.42057937383651733,
"reward_std": 0.18396975100040436,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7294925451278687,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.19302131235599518,
"step": 147
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.761721666417799,
"calib/avg_num_step_conf": 2.10546875,
"calib/ece": 0.14207843137254897,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.3064976855308348,
"calib/mean_conf": 0.7328235294117648,
"calib/mu_c": 0.8217679558011051,
"calib/mu_w": 0.5152702702702703,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08254901960784308,
"calib/std_conf": 0.3262848933113967,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6500506329113923,
"calib/step_q_c_n": 395.0,
"calib/step_q_gap": 0.11220341068917006,
"calib/step_q_w": 0.5378472222222223,
"calib/step_q_w_n": 144.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1208.0,
"completions/max_terminated_length": 1208.0,
"completions/mean_length": 408.74609375,
"completions/mean_terminated_length": 410.3490295410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.008046685717999935,
"kl": 0.06903076171875,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0241,
"num_tokens": 33425839.0,
"reward": 0.4869382381439209,
"reward_std": 0.1796838790178299,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7993249893188477,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.16372980177402496,
"step": 148
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8624321530111139,
"calib/avg_num_step_conf": 2.25,
"calib/ece": 0.1641984126984126,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5357142857142857,
"calib/gap": 0.4337823727061256,
"calib/mean_conf": 0.7339126984126986,
"calib/mu_c": 0.916376712328767,
"calib/mu_w": 0.4825943396226415,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15937301587301578,
"calib/std_conf": 0.3253357733086017,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7625072992700731,
"calib/step_q_c_n": 274.0,
"calib/step_q_gap": 0.35601723304490757,
"calib/step_q_w": 0.40649006622516554,
"calib/step_q_w_n": 302.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1982.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 483.5234375,
"completions/mean_terminated_length": 487.3307189941406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.00647739227861166,
"kl": 0.05419921875,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0281,
"num_tokens": 33654077.0,
"reward": 0.49852481484413147,
"reward_std": 0.21163997054100037,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.8149806261062622,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.12652483582496643,
"step": 149
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7663901525287664,
"calib/avg_num_step_conf": 2.4375,
"calib/ece": 0.20080321285140562,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5783132530120482,
"calib/gap": 0.28906609579876885,
"calib/mean_conf": 0.7951807228915664,
"calib/mu_c": 0.9124324324324323,
"calib/mu_w": 0.6233663366336635,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.20080321285140562,
"calib/std_conf": 0.2744005285529898,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6704747774480712,
"calib/step_q_c_n": 337.0,
"calib/step_q_gap": 0.17747826176862863,
"calib/step_q_w": 0.4929965156794425,
"calib/step_q_w_n": 287.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2682.0,
"completions/max_terminated_length": 2682.0,
"completions/mean_length": 441.90234375,
"completions/mean_terminated_length": 443.63531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.16,
"grad_norm": 0.007934695109724998,
"kl": 0.07119369506835938,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0276,
"num_tokens": 33872164.0,
"reward": 0.4670780897140503,
"reward_std": 0.20850981771945953,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7612718343734741,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.1372719407081604,
"step": 150
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7397745571658615,
"calib/avg_num_step_conf": 1.859375,
"calib/ece": 0.2719200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.512,
"calib/gap": 0.2768953301127214,
"calib/mean_conf": 0.70352,
"calib/mu_c": 0.8530434782608696,
"calib/mu_w": 0.5761481481481482,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2577200000000001,
"calib/std_conf": 0.3358660590175792,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7825128205128207,
"calib/step_q_c_n": 195.0,
"calib/step_q_gap": 0.28119609453417294,
"calib/step_q_w": 0.5013167259786477,
"calib/step_q_w_n": 281.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 487.125,
"completions/mean_terminated_length": 490.96063232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.0062635913491249084,
"kl": 0.052555084228515625,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0214,
"num_tokens": 34103892.0,
"reward": 0.3825833797454834,
"reward_std": 0.186384379863739,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6929078102111816,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.21133476495742798,
"step": 151
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.761882008154944,
"calib/avg_num_step_conf": 2.2734375,
"calib/ece": 0.15128853754940716,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4189723320158103,
"calib/gap": 0.3069249490316004,
"calib/mean_conf": 0.698086956521739,
"calib/mu_c": 0.8303194444444445,
"calib/mu_w": 0.5233944954128441,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.14010276679841902,
"calib/std_conf": 0.3178229059605173,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6928536585365853,
"calib/step_q_c_n": 328.0,
"calib/step_q_gap": 0.20253869790666407,
"calib/step_q_w": 0.49031496062992125,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2254.0,
"completions/max_terminated_length": 2254.0,
"completions/mean_length": 440.63671875,
"completions/mean_terminated_length": 442.36474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.0070022461004555225,
"kl": 0.06756591796875,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0227,
"num_tokens": 34322087.0,
"reward": 0.45159396529197693,
"reward_std": 0.19584518671035767,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7669804096221924,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.1716049760580063,
"step": 152
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7111842105263158,
"calib/avg_num_step_conf": 2.125,
"calib/ece": 0.21464285714285697,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5753968253968254,
"calib/gap": 0.206663157894737,
"calib/mean_conf": 0.7907539682539684,
"calib/mu_c": 0.872763157894737,
"calib/mu_w": 0.6661,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20111111111111096,
"calib/std_conf": 0.2823003350173705,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7380727272727273,
"calib/step_q_c_n": 275.0,
"calib/step_q_gap": 0.3008980060831362,
"calib/step_q_w": 0.4371747211895911,
"calib/step_q_w_n": 269.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2211.0,
"completions/max_terminated_length": 2211.0,
"completions/mean_length": 464.30078125,
"completions/mean_terminated_length": 467.9566955566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1632,
"grad_norm": 0.006464678328484297,
"kl": 0.057842254638671875,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0094,
"num_tokens": 34548268.0,
"reward": 0.4233105182647705,
"reward_std": 0.19462406635284424,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7310148477554321,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.1992376148700714,
"step": 153
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7771455223880597,
"calib/avg_num_step_conf": 1.8671875,
"calib/ece": 0.2810236220472442,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5039370078740157,
"calib/gap": 0.2784477611940299,
"calib/mean_conf": 0.7311023622047245,
"calib/mu_c": 0.8780000000000001,
"calib/mu_w": 0.5995522388059702,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2698425196850395,
"calib/std_conf": 0.3175688364747583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7962146892655367,
"calib/step_q_c_n": 177.0,
"calib/step_q_gap": 0.30312498826885903,
"calib/step_q_w": 0.4930897009966777,
"calib/step_q_w_n": 301.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 985.0,
"completions/max_terminated_length": 985.0,
"completions/mean_length": 407.375,
"completions/mean_terminated_length": 410.5826721191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.007672094739973545,
"kl": 0.06097412109375,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0456,
"num_tokens": 34756996.0,
"reward": 0.3938148617744446,
"reward_std": 0.19781461358070374,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7103534936904907,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.21334879100322723,
"step": 154
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6745472837022134,
"calib/avg_num_step_conf": 2.25,
"calib/ece": 0.21767716535433076,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.49606299212598426,
"calib/gap": 0.1960110663983904,
"calib/mean_conf": 0.7527952755905514,
"calib/mu_c": 0.839225352112676,
"calib/mu_w": 0.6432142857142856,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20570866141732289,
"calib/std_conf": 0.296939211111955,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6303960396039605,
"calib/step_q_c_n": 303.0,
"calib/step_q_gap": 0.09116527037319133,
"calib/step_q_w": 0.5392307692307692,
"calib/step_q_w_n": 273.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2539.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 405.46484375,
"completions/mean_terminated_length": 407.054931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.008317645639181137,
"kl": 0.066864013671875,
"learning_rate": 1.25e-06,
"loss": -0.0029,
"num_tokens": 34968011.0,
"reward": 0.3980780243873596,
"reward_std": 0.18083029985427856,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.718758225440979,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.23197712004184723,
"step": 155
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6635432283858071,
"calib/avg_num_step_conf": 1.953125,
"calib/ece": 0.2611417322834647,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.531496062992126,
"calib/gap": 0.16840204897551225,
"calib/mean_conf": 0.7310629921259844,
"calib/mu_c": 0.8079710144927537,
"calib/mu_w": 0.6395689655172414,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22444881889763796,
"calib/std_conf": 0.32045005870881077,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6607053941908715,
"calib/step_q_c_n": 241.0,
"calib/step_q_gap": 0.1683540428395201,
"calib/step_q_w": 0.4923513513513514,
"calib/step_q_w_n": 259.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2500.0,
"completions/max_terminated_length": 2500.0,
"completions/mean_length": 427.13671875,
"completions/mean_terminated_length": 427.13671875,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.1664,
"grad_norm": 0.006734688300639391,
"kl": 0.062412261962890625,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0044,
"num_tokens": 35182118.0,
"reward": 0.3882417678833008,
"reward_std": 0.21631193161010742,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6920551061630249,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.22182153165340424,
"step": 156
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7651593364994149,
"calib/avg_num_step_conf": 2.2578125,
"calib/ece": 0.1635826771653544,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5236220472440944,
"calib/gap": 0.295426388602106,
"calib/mean_conf": 0.7294094488188977,
"calib/mu_c": 0.8305988023952094,
"calib/mu_w": 0.5351724137931034,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11775590551181109,
"calib/std_conf": 0.3333887767249415,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6801794871794872,
"calib/step_q_c_n": 390.0,
"calib/step_q_gap": 0.20778587015821054,
"calib/step_q_w": 0.47239361702127664,
"calib/step_q_w_n": 188.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2433.0,
"completions/max_terminated_length": 2433.0,
"completions/mean_length": 423.82421875,
"completions/mean_terminated_length": 423.82421875,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.0070098452270030975,
"kl": 0.06569671630859375,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0197,
"num_tokens": 35394345.0,
"reward": 0.47161611914634705,
"reward_std": 0.17706799507141113,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7853542566299438,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.17180952429771423,
"step": 157
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6336452940395592,
"calib/avg_num_step_conf": 2.0390625,
"calib/ece": 0.22211764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5686274509803921,
"calib/gap": 0.17093787335722854,
"calib/mean_conf": 0.7811764705882352,
"calib/mu_c": 0.8435185185185188,
"calib/mu_w": 0.6725806451612902,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.184,
"calib/std_conf": 0.29557950430964497,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7130952380952381,
"calib/step_q_c_n": 336.0,
"calib/step_q_gap": 0.14083717357910908,
"calib/step_q_w": 0.572258064516129,
"calib/step_q_w_n": 186.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1364.0,
"completions/max_terminated_length": 1364.0,
"completions/mean_length": 428.8984375,
"completions/mean_terminated_length": 430.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.007019939366728067,
"kl": 0.06101226806640625,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0395,
"num_tokens": 35609383.0,
"reward": 0.4132624864578247,
"reward_std": 0.20368565618991852,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7299218773841858,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.22761566936969757,
"step": 158
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7862674238513164,
"calib/avg_num_step_conf": 2.00390625,
"calib/ece": 0.1817391304347824,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5731225296442688,
"calib/gap": 0.307683273102736,
"calib/mean_conf": 0.7633201581027668,
"calib/mu_c": 0.8897986577181207,
"calib/mu_w": 0.5821153846153847,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17806324110671917,
"calib/std_conf": 0.30821307671940945,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7772161172161173,
"calib/step_q_c_n": 273.0,
"calib/step_q_gap": 0.2800494505494506,
"calib/step_q_w": 0.4971666666666667,
"calib/step_q_w_n": 240.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 438.69921875,
"completions/mean_terminated_length": 438.69921875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1696,
"grad_norm": 0.007189049851149321,
"kl": 0.07638168334960938,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0311,
"num_tokens": 35826474.0,
"reward": 0.4793606400489807,
"reward_std": 0.15734824538230896,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7705523371696472,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.12511231005191803,
"step": 159
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7946846341006926,
"calib/avg_num_step_conf": 2.0625,
"calib/ece": 0.19279527559055118,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4763779527559055,
"calib/gap": 0.326921829184603,
"calib/mean_conf": 0.7164173228346458,
"calib/mu_c": 0.8670072992700731,
"calib/mu_w": 0.5400854700854701,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18492125984251967,
"calib/std_conf": 0.32399596432173317,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7056153846153845,
"calib/step_q_c_n": 260.0,
"calib/step_q_gap": 0.22916016073478745,
"calib/step_q_w": 0.4764552238805971,
"calib/step_q_w_n": 268.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2614.0,
"completions/max_terminated_length": 2614.0,
"completions/mean_length": 454.5,
"completions/mean_terminated_length": 454.5,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.007117413450032473,
"kl": 0.05873870849609375,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0267,
"num_tokens": 36047666.0,
"reward": 0.46024322509765625,
"reward_std": 0.1503116339445114,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7683855295181274,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.15258657932281494,
"step": 160
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6931727490386709,
"calib/avg_num_step_conf": 2.1015625,
"calib/ece": 0.16796875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.4453125,
"calib/gap": 0.24856997750852516,
"calib/mean_conf": 0.6948437500000001,
"calib/mu_c": 0.7696089385474861,
"calib/mu_w": 0.521038961038961,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08179687499999999,
"calib/std_conf": 0.3302401691737961,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6068948655256724,
"calib/step_q_c_n": 409.0,
"calib/step_q_gap": 0.11526695854892827,
"calib/step_q_w": 0.49162790697674413,
"calib/step_q_w_n": 129.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1155.0,
"completions/max_terminated_length": 1155.0,
"completions/mean_length": 398.4296875,
"completions/mean_terminated_length": 399.9921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.007734321523457766,
"kl": 0.0670166015625,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0105,
"num_tokens": 36253584.0,
"reward": 0.4745808243751526,
"reward_std": 0.15935248136520386,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7851648330688477,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.17584697902202606,
"step": 161
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7935420743639922,
"calib/avg_num_step_conf": 1.83203125,
"calib/ece": 0.10458823529411768,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.3656977269306033,
"calib/mean_conf": 0.7849803921568628,
"calib/mu_c": 0.8896703296703294,
"calib/mu_w": 0.5239726027397261,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.087921568627451,
"calib/std_conf": 0.2960363974837221,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7499999999999999,
"calib/step_q_c_n": 341.0,
"calib/step_q_gap": 0.22757812499999985,
"calib/step_q_w": 0.522421875,
"calib/step_q_w_n": 128.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1374.0,
"completions/max_terminated_length": 1374.0,
"completions/mean_length": 410.00390625,
"completions/mean_terminated_length": 411.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1728,
"grad_norm": 0.007531987503170967,
"kl": 0.06168365478515625,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0069,
"num_tokens": 36462689.0,
"reward": 0.5377246141433716,
"reward_std": 0.17636175453662872,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.8490738272666931,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.11503089964389801,
"step": 162
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.783248001998002,
"calib/avg_num_step_conf": 2.07421875,
"calib/ece": 0.16941176470588232,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4823529411764706,
"calib/gap": 0.35326111388611403,
"calib/mean_conf": 0.7001568627450981,
"calib/mu_c": 0.8553146853146855,
"calib/mu_w": 0.5020535714285714,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15439215686274504,
"calib/std_conf": 0.3368155003506106,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.773085501858736,
"calib/step_q_c_n": 269.0,
"calib/step_q_gap": 0.3311389369732399,
"calib/step_q_w": 0.44194656488549616,
"calib/step_q_w_n": 262.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1269.0,
"completions/max_terminated_length": 1269.0,
"completions/mean_length": 428.6875,
"completions/mean_terminated_length": 430.36865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.0073830727487802505,
"kl": 0.06630706787109375,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0014,
"num_tokens": 36677265.0,
"reward": 0.4658210873603821,
"reward_std": 0.189020574092865,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7848738431930542,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.16338786482810974,
"step": 163
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.774967824967825,
"calib/avg_num_step_conf": 1.92578125,
"calib/ece": 0.1702292490118577,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.43873517786561267,
"calib/gap": 0.3019448519948519,
"calib/mean_conf": 0.7031936758893281,
"calib/mu_c": 0.8285067567567567,
"calib/mu_w": 0.5265619047619048,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.14422134387351776,
"calib/std_conf": 0.3259338757223692,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6631146953405018,
"calib/step_q_c_n": 279.0,
"calib/step_q_gap": 0.1753670317890999,
"calib/step_q_w": 0.4877476635514019,
"calib/step_q_w_n": 214.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2330.0,
"completions/max_terminated_length": 2330.0,
"completions/mean_length": 471.390625,
"completions/mean_terminated_length": 471.390625,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.006834862753748894,
"kl": 0.0714569091796875,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0036,
"num_tokens": 36904077.0,
"reward": 0.4619291424751282,
"reward_std": 0.17501939833164215,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.771399199962616,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.1600409299135208,
"step": 164
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6768137528703532,
"calib/avg_num_step_conf": 2.203125,
"calib/ece": 0.28952755905511807,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5511811023622047,
"calib/gap": 0.1804319493576616,
"calib/mean_conf": 0.745984251968504,
"calib/mu_c": 0.8333587786259543,
"calib/mu_w": 0.6529268292682927,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25988188976377946,
"calib/std_conf": 0.3173022599375055,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7048421052631578,
"calib/step_q_c_n": 285.0,
"calib/step_q_gap": 0.14688511601584608,
"calib/step_q_w": 0.5579569892473117,
"calib/step_q_w_n": 279.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 481.8046875,
"completions/mean_terminated_length": 483.69415283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.176,
"grad_norm": 0.007147556636482477,
"kl": 0.05928802490234375,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0406,
"num_tokens": 37132995.0,
"reward": 0.36501526832580566,
"reward_std": 0.20879912376403809,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6746468544006348,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.24383507668972015,
"step": 165
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8364864864864865,
"calib/avg_num_step_conf": 1.9453125,
"calib/ece": 0.10988188976377947,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5275590551181102,
"calib/gap": 0.39777327327327316,
"calib/mean_conf": 0.719724409448819,
"calib/mu_c": 0.835611111111111,
"calib/mu_w": 0.4378378378378378,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.060472440944881814,
"calib/std_conf": 0.3372893376289778,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7329824561403508,
"calib/step_q_c_n": 342.0,
"calib/step_q_gap": 0.21009784075573545,
"calib/step_q_w": 0.5228846153846154,
"calib/step_q_w_n": 156.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2609.0,
"completions/max_terminated_length": 2609.0,
"completions/mean_length": 483.12109375,
"completions/mean_terminated_length": 483.12109375,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.006450401619076729,
"kl": 0.06050872802734375,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0376,
"num_tokens": 37362858.0,
"reward": 0.5276123881340027,
"reward_std": 0.15655210614204407,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.8373090028762817,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.12114673852920532,
"step": 166
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6948282097649187,
"calib/avg_num_step_conf": 1.73046875,
"calib/ece": 0.17807086614173234,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6496062992125984,
"calib/gap": 0.23772224231464734,
"calib/mean_conf": 0.8042913385826772,
"calib/mu_c": 0.8782285714285715,
"calib/mu_w": 0.6405063291139241,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14669291338582682,
"calib/std_conf": 0.29233126528843495,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7734219269102991,
"calib/step_q_c_n": 301.0,
"calib/step_q_gap": 0.19468953254410182,
"calib/step_q_w": 0.5787323943661973,
"calib/step_q_w_n": 142.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1067.0,
"completions/max_terminated_length": 1067.0,
"completions/mean_length": 424.11328125,
"completions/mean_terminated_length": 425.7764892578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.006349243223667145,
"kl": 0.05992889404296875,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0187,
"num_tokens": 37577039.0,
"reward": 0.46977323293685913,
"reward_std": 0.15033377707004547,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7826762199401855,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.17828592658042908,
"step": 167
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8598246094820499,
"calib/avg_num_step_conf": 2.22265625,
"calib/ece": 0.12833992094861651,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5612648221343873,
"calib/gap": 0.41263565360372706,
"calib/mean_conf": 0.7647826086956522,
"calib/mu_c": 0.909939024390244,
"calib/mu_w": 0.49730337078651693,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.12245059288537541,
"calib/std_conf": 0.30643502263408473,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.733914373088685,
"calib/step_q_c_n": 327.0,
"calib/step_q_gap": 0.3189970177167842,
"calib/step_q_w": 0.41491735537190083,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 492.76953125,
"completions/mean_terminated_length": 494.7019958496094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.1792,
"grad_norm": 0.0063049341551959515,
"kl": 0.05417633056640625,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0112,
"num_tokens": 37807860.0,
"reward": 0.5274177193641663,
"reward_std": 0.16986939311027527,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.8387695550918579,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.10893408954143524,
"step": 168
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7536912751677853,
"calib/avg_num_step_conf": 1.78515625,
"calib/ece": 0.2051181102362205,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5708661417322834,
"calib/gap": 0.26666538830297226,
"calib/mean_conf": 0.7738582677165354,
"calib/mu_c": 0.8840939597315437,
"calib/mu_w": 0.6174285714285714,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19618110236220476,
"calib/std_conf": 0.3029670787034485,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8015637860082304,
"calib/step_q_c_n": 243.0,
"calib/step_q_gap": 0.2658161224568285,
"calib/step_q_w": 0.5357476635514019,
"calib/step_q_w_n": 214.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1537.0,
"completions/max_terminated_length": 1537.0,
"completions/mean_length": 442.9921875,
"completions/mean_terminated_length": 444.72943115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.006920646410435438,
"kl": 0.0618438720703125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0157,
"num_tokens": 38025450.0,
"reward": 0.46007660031318665,
"reward_std": 0.15827137231826782,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7540468573570251,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.1495186686515808,
"step": 169
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7508914754760143,
"calib/avg_num_step_conf": 2.27734375,
"calib/ece": 0.1843749999999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.57421875,
"calib/gap": 0.25116329139473836,
"calib/mean_conf": 0.7796875000000001,
"calib/mu_c": 0.8670059880239519,
"calib/mu_w": 0.6158426966292135,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1558593749999999,
"calib/std_conf": 0.29746569523854344,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7223076923076923,
"calib/step_q_c_n": 325.0,
"calib/step_q_gap": 0.272850327966607,
"calib/step_q_w": 0.4494573643410853,
"calib/step_q_w_n": 258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1276.0,
"completions/max_terminated_length": 1276.0,
"completions/mean_length": 462.48046875,
"completions/mean_terminated_length": 464.2941589355469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.006413524504750967,
"kl": 0.06085205078125,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0082,
"num_tokens": 38247997.0,
"reward": 0.4750288128852844,
"reward_std": 0.20399650931358337,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7785238027572632,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.15815365314483643,
"step": 170
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6820796733885933,
"calib/avg_num_step_conf": 2.30859375,
"calib/ece": 0.2445098039215686,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.47843137254901963,
"calib/gap": 0.19062786094271944,
"calib/mean_conf": 0.6989411764705883,
"calib/mu_c": 0.7871532846715329,
"calib/mu_w": 0.5965254237288135,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20309803921568625,
"calib/std_conf": 0.33759248038767425,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6557492354740061,
"calib/step_q_c_n": 327.0,
"calib/step_q_gap": 0.1330977203224909,
"calib/step_q_w": 0.5226515151515152,
"calib/step_q_w_n": 264.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2336.0,
"completions/max_terminated_length": 2336.0,
"completions/mean_length": 442.7109375,
"completions/mean_terminated_length": 442.7109375,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.1824,
"grad_norm": 0.007600404787808657,
"kl": 0.068572998046875,
"learning_rate": 8.055555555555557e-07,
"loss": 0.0208,
"num_tokens": 38468227.0,
"reward": 0.39803436398506165,
"reward_std": 0.20574426651000977,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7018972635269165,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.21129731833934784,
"step": 171
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7579238754325259,
"calib/avg_num_step_conf": 2.13671875,
"calib/ece": 0.18980392156862735,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.611764705882353,
"calib/gap": 0.21288235294117652,
"calib/mean_conf": 0.8309803921568629,
"calib/mu_c": 0.9019411764705884,
"calib/mu_w": 0.6890588235294118,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.17705882352941169,
"calib/std_conf": 0.2571003840111285,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7507894736842107,
"calib/step_q_c_n": 342.0,
"calib/step_q_gap": 0.21293581514762527,
"calib/step_q_w": 0.5378536585365854,
"calib/step_q_w_n": 205.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 443.11328125,
"completions/mean_terminated_length": 444.85101318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.006095860619097948,
"kl": 0.0571746826171875,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0012,
"num_tokens": 38685016.0,
"reward": 0.4755558669567108,
"reward_std": 0.1840173453092575,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7747269868850708,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.15408393740653992,
"step": 172
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6584417372152248,
"calib/avg_num_step_conf": 2.12109375,
"calib/ece": 0.24011811023622043,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6732283464566929,
"calib/gap": 0.12509532658820277,
"calib/mean_conf": 0.8352362204724411,
"calib/mu_c": 0.8780838323353293,
"calib/mu_w": 0.7529885057471265,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2089370078740157,
"calib/std_conf": 0.2684421067424582,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7461980830670927,
"calib/step_q_c_n": 313.0,
"calib/step_q_gap": 0.10011112654535348,
"calib/step_q_w": 0.6460869565217392,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2343.0,
"completions/max_terminated_length": 2343.0,
"completions/mean_length": 496.71484375,
"completions/mean_terminated_length": 496.71484375,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.00640326039865613,
"kl": 0.058563232421875,
"learning_rate": 7.5e-07,
"loss": -0.0174,
"num_tokens": 38915335.0,
"reward": 0.43277373909950256,
"reward_std": 0.16515189409255981,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7180511355400085,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.180628702044487,
"step": 173
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6305704099821746,
"calib/avg_num_step_conf": 2.37109375,
"calib/ece": 0.290398406374502,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4940239043824701,
"calib/gap": 0.13328749681690877,
"calib/mean_conf": 0.7128685258964144,
"calib/mu_c": 0.776060606060606,
"calib/mu_w": 0.6427731092436972,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23868525896414336,
"calib/std_conf": 0.3249709436067534,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6553260869565217,
"calib/step_q_c_n": 276.0,
"calib/step_q_gap": 0.1292445159595429,
"calib/step_q_w": 0.5260815709969788,
"calib/step_q_w_n": 331.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 544.0703125,
"completions/mean_terminated_length": 546.2039794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.1856,
"grad_norm": 0.0059709707275033,
"kl": 0.05558013916015625,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0299,
"num_tokens": 39158849.0,
"reward": 0.3485894203186035,
"reward_std": 0.2507731020450592,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6534445285797119,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.2531406879425049,
"step": 174
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7570719602977668,
"calib/avg_num_step_conf": 2.0078125,
"calib/ece": 0.22905511811023632,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.301862282878412,
"calib/mean_conf": 0.6925196850393701,
"calib/mu_c": 0.8470161290322582,
"calib/mu_w": 0.5451538461538462,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21669291338582686,
"calib/std_conf": 0.3442119117109115,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6973622047244095,
"calib/step_q_c_n": 254.0,
"calib/step_q_gap": 0.19455451241671706,
"calib/step_q_w": 0.5028076923076924,
"calib/step_q_w_n": 260.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 479.45703125,
"completions/mean_terminated_length": 481.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.006477923132479191,
"kl": 0.056880950927734375,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0068,
"num_tokens": 39387414.0,
"reward": 0.4128992557525635,
"reward_std": 0.1991274058818817,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7348886728286743,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.20362144708633423,
"step": 175
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7421730555897942,
"calib/avg_num_step_conf": 2.3203125,
"calib/ece": 0.2615294117647059,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5450980392156862,
"calib/gap": 0.2454147664242574,
"calib/mean_conf": 0.7598039215686274,
"calib/mu_c": 0.877218045112782,
"calib/mu_w": 0.6318032786885246,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24988235294117647,
"calib/std_conf": 0.3098177212116475,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7332653061224491,
"calib/step_q_c_n": 294.0,
"calib/step_q_gap": 0.20066530612244915,
"calib/step_q_w": 0.5326,
"calib/step_q_w_n": 300.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1681.0,
"completions/max_terminated_length": 1681.0,
"completions/mean_length": 448.6484375,
"completions/mean_terminated_length": 450.4078674316406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.009094655513763428,
"kl": 0.09540557861328125,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0078,
"num_tokens": 39606332.0,
"reward": 0.41096848249435425,
"reward_std": 0.2125389277935028,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7137258052825928,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.19413259625434875,
"step": 176
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7259706802429001,
"calib/avg_num_step_conf": 2.90625,
"calib/ece": 0.239375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5546875,
"calib/gap": 0.25054775194749435,
"calib/mean_conf": 0.75046875,
"calib/mu_c": 0.8669343065693431,
"calib/mu_w": 0.6163865546218488,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22734375,
"calib/std_conf": 0.31732120796353575,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6893696275071635,
"calib/step_q_c_n": 349.0,
"calib/step_q_gap": 0.21255950092488496,
"calib/step_q_w": 0.4768101265822785,
"calib/step_q_w_n": 395.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1354.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 480.25,
"completions/mean_terminated_length": 482.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1888,
"grad_norm": 0.006366265006363392,
"kl": 0.05780792236328125,
"learning_rate": 6.388888888888889e-07,
"loss": 0.0256,
"num_tokens": 39833108.0,
"reward": 0.42046934366226196,
"reward_std": 0.19948306679725647,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7267382740974426,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.19204957783222198,
"step": 177
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7904965404965405,
"calib/avg_num_step_conf": 2.52734375,
"calib/ece": 0.19743083003952552,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6284584980237155,
"calib/gap": 0.2697049247049248,
"calib/mean_conf": 0.822806324110672,
"calib/mu_c": 0.9198148148148148,
"calib/mu_w": 0.65010989010989,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18996047430830024,
"calib/std_conf": 0.26390880199470856,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7512957746478874,
"calib/step_q_c_n": 355.0,
"calib/step_q_gap": 0.3298916650588464,
"calib/step_q_w": 0.42140410958904106,
"calib/step_q_w_n": 292.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3064.0,
"completions/max_terminated_length": 3064.0,
"completions/mean_length": 452.44140625,
"completions/mean_terminated_length": 454.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.006662712432444096,
"kl": 0.057811737060546875,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0158,
"num_tokens": 40055005.0,
"reward": 0.48111921548843384,
"reward_std": 0.16879983246326447,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7817012071609497,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.14368149638175964,
"step": 178
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7012151996399408,
"calib/avg_num_step_conf": 2.328125,
"calib/ece": 0.2307086614173229,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5551181102362205,
"calib/gap": 0.20123834629974946,
"calib/mean_conf": 0.784488188976378,
"calib/mu_c": 0.8660927152317881,
"calib/mu_w": 0.6648543689320386,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2103543307086615,
"calib/std_conf": 0.29155761079361453,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7528892307692308,
"calib/step_q_c_n": 325.0,
"calib/step_q_gap": 0.22488185069542999,
"calib/step_q_w": 0.5280073800738008,
"calib/step_q_w_n": 271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2499.0,
"completions/max_terminated_length": 2499.0,
"completions/mean_length": 475.6171875,
"completions/mean_terminated_length": 475.6171875,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.006497078109532595,
"kl": 0.06073760986328125,
"learning_rate": 5.833333333333334e-07,
"loss": 0.003,
"num_tokens": 40283027.0,
"reward": 0.4224216341972351,
"reward_std": 0.23153510689735413,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7222155928611755,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.1914348155260086,
"step": 179
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7532517957678121,
"calib/avg_num_step_conf": 1.9765625,
"calib/ece": 0.18440944881889765,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.547244094488189,
"calib/gap": 0.29907461334368723,
"calib/mean_conf": 0.7678740157480316,
"calib/mu_c": 0.886797385620915,
"calib/mu_w": 0.5877227722772278,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17496062992125982,
"calib/std_conf": 0.30078409323323846,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7633684210526317,
"calib/step_q_c_n": 285.0,
"calib/step_q_gap": 0.18762181471779005,
"calib/step_q_w": 0.5757466063348416,
"calib/step_q_w_n": 221.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2583.0,
"completions/max_terminated_length": 2583.0,
"completions/mean_length": 529.6796875,
"completions/mean_terminated_length": 529.6796875,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.192,
"grad_norm": 0.006103991065174341,
"kl": 0.0552825927734375,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0129,
"num_tokens": 40522481.0,
"reward": 0.47175779938697815,
"reward_std": 0.21102026104927063,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.769753098487854,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.14186255633831024,
"step": 180
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7538148843026892,
"calib/avg_num_step_conf": 2.578125,
"calib/ece": 0.2626086956521738,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.47035573122529645,
"calib/gap": 0.23111757348342743,
"calib/mean_conf": 0.7637154150197629,
"calib/mu_c": 0.8760769230769232,
"calib/mu_w": 0.6449593495934958,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25624505928853747,
"calib/std_conf": 0.2807662265161187,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7090378006872852,
"calib/step_q_c_n": 291.0,
"calib/step_q_gap": 0.18971530746235288,
"calib/step_q_w": 0.5193224932249323,
"calib/step_q_w_n": 369.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1245.0,
"completions/max_terminated_length": 1245.0,
"completions/mean_length": 433.6015625,
"completions/mean_terminated_length": 437.0157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.007144651375710964,
"kl": 0.07428741455078125,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0021,
"num_tokens": 40739747.0,
"reward": 0.40516453981399536,
"reward_std": 0.20815244317054749,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7120101451873779,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.20011860132217407,
"step": 181
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7901029787822241,
"calib/avg_num_step_conf": 2.1328125,
"calib/ece": 0.17932000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.64,
"calib/gap": 0.30472942152187443,
"calib/mean_conf": 0.8017199999999999,
"calib/mu_c": 0.9126415094339624,
"calib/mu_w": 0.607912087912088,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17252000000000006,
"calib/std_conf": 0.2873253236315936,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7957911392405064,
"calib/step_q_c_n": 316.0,
"calib/step_q_gap": 0.2641824435883324,
"calib/step_q_w": 0.531608695652174,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 484.37109375,
"completions/mean_terminated_length": 490.1146545410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.005775344092398882,
"kl": 0.050815582275390625,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0152,
"num_tokens": 40969906.0,
"reward": 0.4684050679206848,
"reward_std": 0.21433956921100616,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7808293104171753,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.16355042159557343,
"step": 182
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6680239898989898,
"calib/avg_num_step_conf": 1.87890625,
"calib/ece": 0.2461811023622047,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5433070866141733,
"calib/gap": 0.20218939393939417,
"calib/mean_conf": 0.7603543307086614,
"calib/mu_c": 0.8479166666666668,
"calib/mu_w": 0.6457272727272726,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21980314960629915,
"calib/std_conf": 0.31335665643925337,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7191319444444445,
"calib/step_q_c_n": 288.0,
"calib/step_q_gap": 0.09586769573978127,
"calib/step_q_w": 0.6232642487046632,
"calib/step_q_w_n": 193.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1571.0,
"completions/max_terminated_length": 1571.0,
"completions/mean_length": 467.65234375,
"completions/mean_terminated_length": 469.4862976074219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1952,
"grad_norm": 0.006125408224761486,
"kl": 0.05516815185546875,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0079,
"num_tokens": 41196305.0,
"reward": 0.39300990104675293,
"reward_std": 0.26003241539001465,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7125464677810669,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.23746415972709656,
"step": 183
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7377312128864925,
"calib/avg_num_step_conf": 2.5390625,
"calib/ece": 0.21607142857142864,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.2206832298136645,
"calib/mean_conf": 0.8224206349206349,
"calib/mu_c": 0.9021118012422361,
"calib/mu_w": 0.6814285714285716,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19980158730158734,
"calib/std_conf": 0.26944109858361637,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7238983050847457,
"calib/step_q_c_n": 354.0,
"calib/step_q_gap": 0.27320573751717814,
"calib/step_q_w": 0.4506925675675676,
"calib/step_q_w_n": 296.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 482.55859375,
"completions/mean_terminated_length": 486.3582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.006096668541431427,
"kl": 0.062225341796875,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0204,
"num_tokens": 41425120.0,
"reward": 0.44166046380996704,
"reward_std": 0.2429095208644867,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7461289167404175,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.18390172719955444,
"step": 184
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7485714285714284,
"calib/avg_num_step_conf": 2.3359375,
"calib/ece": 0.26878431372549005,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.2520714285714285,
"calib/mean_conf": 0.7883921568627452,
"calib/mu_c": 0.9020714285714284,
"calib/mu_w": 0.6499999999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2540784313725489,
"calib/std_conf": 0.3005579177118272,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7370860927152318,
"calib/step_q_c_n": 302.0,
"calib/step_q_gap": 0.25073474136388046,
"calib/step_q_w": 0.4863513513513514,
"calib/step_q_w_n": 296.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2062.0,
"completions/max_terminated_length": 2062.0,
"completions/mean_length": 474.80859375,
"completions/mean_terminated_length": 474.80859375,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.006603894755244255,
"kl": 0.07291412353515625,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0034,
"num_tokens": 41653591.0,
"reward": 0.4074620008468628,
"reward_std": 0.1967330127954483,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7267429828643799,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.2204127013683319,
"step": 185
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7217297084318363,
"calib/avg_num_step_conf": 1.9921875,
"calib/ece": 0.18781249999999994,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.56640625,
"calib/gap": 0.24395849750459675,
"calib/mean_conf": 0.7678906250000002,
"calib/mu_c": 0.8574691358024693,
"calib/mu_w": 0.6135106382978726,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16144531249999994,
"calib/std_conf": 0.3046223387854367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7850961538461538,
"calib/step_q_c_n": 312.0,
"calib/step_q_gap": 0.1851971639471639,
"calib/step_q_w": 0.5998989898989899,
"calib/step_q_w_n": 198.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1387.0,
"completions/max_terminated_length": 1387.0,
"completions/mean_length": 460.98828125,
"completions/mean_terminated_length": 462.7961120605469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.1984,
"grad_norm": 0.005948403850197792,
"kl": 0.05864715576171875,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0043,
"num_tokens": 41876644.0,
"reward": 0.45981481671333313,
"reward_std": 0.19684401154518127,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7699711322784424,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.17690393328666687,
"step": 186
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7147108843537415,
"calib/avg_num_step_conf": 2.48828125,
"calib/ece": 0.21736220472440942,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5669291338582677,
"calib/gap": 0.19568681318681314,
"calib/mean_conf": 0.8216141732283464,
"calib/mu_c": 0.8971153846153845,
"calib/mu_w": 0.7014285714285714,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21240157480314958,
"calib/std_conf": 0.24712550757061275,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6770189701897019,
"calib/step_q_c_n": 369.0,
"calib/step_q_gap": 0.19981747765238855,
"calib/step_q_w": 0.47720149253731337,
"calib/step_q_w_n": 268.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 492.94140625,
"completions/mean_terminated_length": 492.94140625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.006126835942268372,
"kl": 0.057132720947265625,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0201,
"num_tokens": 42104381.0,
"reward": 0.45576226711273193,
"reward_std": 0.20533543825149536,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7420246601104736,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.1492500901222229,
"step": 187
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6952429149797571,
"calib/avg_num_step_conf": 1.88671875,
"calib/ece": 0.2142629482071714,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5936254980079682,
"calib/gap": 0.20449797570850203,
"calib/mean_conf": 0.7706772908366535,
"calib/mu_c": 0.8480769230769232,
"calib/mu_w": 0.6435789473684211,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.18171314741035866,
"calib/std_conf": 0.31263808015910155,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7659322033898307,
"calib/step_q_c_n": 295.0,
"calib/step_q_gap": 0.1443364587089797,
"calib/step_q_w": 0.621595744680851,
"calib/step_q_w_n": 188.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 526.2265625,
"completions/mean_terminated_length": 528.2902221679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.005998818203806877,
"kl": 0.052219390869140625,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0515,
"num_tokens": 42343167.0,
"reward": 0.40924012660980225,
"reward_std": 0.23015396296977997,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7154003977775574,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.21254518628120422,
"step": 188
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7343910256410255,
"calib/avg_num_step_conf": 1.6171875,
"calib/ece": 0.1864453124999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.49609375,
"calib/gap": 0.26669487179487195,
"calib/mean_conf": 0.7101171875,
"calib/mu_c": 0.8142948717948719,
"calib/mu_w": 0.5476,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1435937499999999,
"calib/std_conf": 0.3301461352826803,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7537130801687764,
"calib/step_q_c_n": 237.0,
"calib/step_q_gap": 0.24619895587499108,
"calib/step_q_w": 0.5075141242937853,
"calib/step_q_w_n": 177.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1131.0,
"completions/max_terminated_length": 1131.0,
"completions/mean_length": 430.1640625,
"completions/mean_terminated_length": 431.85101318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.2016,
"grad_norm": 0.006668528076261282,
"kl": 0.05899810791015625,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0135,
"num_tokens": 42561057.0,
"reward": 0.4447804391384125,
"reward_std": 0.2044180929660797,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7697839736938477,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.20209810137748718,
"step": 189
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7648008966244726,
"calib/avg_num_step_conf": 2.00390625,
"calib/ece": 0.19811023622047258,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5866141732283464,
"calib/gap": 0.2740479957805906,
"calib/mean_conf": 0.7869291338582678,
"calib/mu_c": 0.890506329113924,
"calib/mu_w": 0.6164583333333334,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18149606299212612,
"calib/std_conf": 0.2979682163756999,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7988996763754046,
"calib/step_q_c_n": 309.0,
"calib/step_q_gap": 0.23551732343422804,
"calib/step_q_w": 0.5633823529411766,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1692.0,
"completions/max_terminated_length": 1692.0,
"completions/mean_length": 510.4453125,
"completions/mean_terminated_length": 512.4470825195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.00550629198551178,
"kl": 0.04981231689453125,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.021,
"num_tokens": 42797339.0,
"reward": 0.4591637849807739,
"reward_std": 0.22176282107830048,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7685238718986511,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.17128996551036835,
"step": 190
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6950024801587301,
"calib/avg_num_step_conf": 2.30859375,
"calib/ece": 0.28672519685039377,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5708661417322834,
"calib/gap": 0.23468420138888857,
"calib/mean_conf": 0.7456228346456694,
"calib/mu_c": 0.8638888888888886,
"calib/mu_w": 0.6292046875,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.26814251968503944,
"calib/std_conf": 0.33022037921463554,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7361940298507463,
"calib/step_q_c_n": 268.0,
"calib/step_q_gap": 0.16311848805508067,
"calib/step_q_w": 0.5730755417956657,
"calib/step_q_w_n": 323.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 469.359375,
"completions/mean_terminated_length": 469.359375,
"completions/min_length": 111.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.006034213118255138,
"kl": 0.06505584716796875,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0111,
"num_tokens": 43021663.0,
"reward": 0.373573362827301,
"reward_std": 0.21615146100521088,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6873253583908081,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.23549112677574158,
"step": 191
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7284951363723059,
"calib/avg_num_step_conf": 2.4296875,
"calib/ece": 0.2149606299212598,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.531496062992126,
"calib/gap": 0.25306122448979596,
"calib/mean_conf": 0.7564566929133858,
"calib/mu_c": 0.8630612244897959,
"calib/mu_w": 0.61,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1963385826771653,
"calib/std_conf": 0.3208319096758122,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7225545171339564,
"calib/step_q_c_n": 321.0,
"calib/step_q_gap": 0.21444820484159755,
"calib/step_q_w": 0.5081063122923588,
"calib/step_q_w_n": 301.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 471.5625,
"completions/mean_terminated_length": 473.41180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.2048,
"grad_norm": 0.0063798511400818825,
"kl": 0.0574493408203125,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0172,
"num_tokens": 43247359.0,
"reward": 0.433309942483902,
"reward_std": 0.24963872134685516,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7353507876396179,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.18123087286949158,
"step": 192
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7555152902282408,
"calib/avg_num_step_conf": 2.28515625,
"calib/ece": 0.200511811023622,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.531496062992126,
"calib/gap": 0.2700521330027338,
"calib/mean_conf": 0.7575984251968504,
"calib/mu_c": 0.871360544217687,
"calib/mu_w": 0.6013084112149533,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1896850393700787,
"calib/std_conf": 0.31375321963633407,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7516428571428573,
"calib/step_q_c_n": 280.0,
"calib/step_q_gap": 0.27567564402810313,
"calib/step_q_w": 0.47596721311475415,
"calib/step_q_w_n": 305.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2612.0,
"completions/max_terminated_length": 2612.0,
"completions/mean_length": 476.28515625,
"completions/mean_terminated_length": 476.28515625,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.005859904922544956,
"kl": 0.055263519287109375,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0113,
"num_tokens": 43475000.0,
"reward": 0.4385659992694855,
"reward_std": 0.21207654476165771,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7476226687431335,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.1829906404018402,
"step": 193
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7885227485489609,
"calib/avg_num_step_conf": 2.046875,
"calib/ece": 0.22960937499999998,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": 0.30555451538413525,
"calib/mean_conf": 0.7755468750000001,
"calib/mu_c": 0.9056462585034013,
"calib/mu_w": 0.600091743119266,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21546875,
"calib/std_conf": 0.3153490342901566,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8037246963562752,
"calib/step_q_c_n": 247.0,
"calib/step_q_gap": 0.26437451585086014,
"calib/step_q_w": 0.5393501805054151,
"calib/step_q_w_n": 277.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1391.0,
"completions/max_terminated_length": 1391.0,
"completions/mean_length": 421.5703125,
"completions/mean_terminated_length": 423.2235412597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.006361325271427631,
"kl": 0.0576019287109375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0022,
"num_tokens": 43688866.0,
"reward": 0.4467424154281616,
"reward_std": 0.20001167058944702,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7649414539337158,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.1863003522157669,
"step": 194
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7467991464390504,
"calib/avg_num_step_conf": 2.15625,
"calib/ece": 0.1723137254901961,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5333333333333333,
"calib/gap": 0.264561883168845,
"calib/mean_conf": 0.7535686274509804,
"calib/mu_c": 0.8490184049079754,
"calib/mu_w": 0.5844565217391304,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14333333333333337,
"calib/std_conf": 0.3055785419336132,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.738676923076923,
"calib/step_q_c_n": 325.0,
"calib/step_q_gap": 0.24198088783463217,
"calib/step_q_w": 0.49669603524229083,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2406.0,
"completions/max_terminated_length": 2406.0,
"completions/mean_length": 475.8984375,
"completions/mean_terminated_length": 477.7647399902344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.208,
"grad_norm": 0.006363731808960438,
"kl": 0.0585174560546875,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0067,
"num_tokens": 43916680.0,
"reward": 0.4543408751487732,
"reward_std": 0.1945040225982666,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7743749618530273,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.18991197645664215,
"step": 195
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7149387278739545,
"calib/avg_num_step_conf": 2.140625,
"calib/ece": 0.2534375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.703125,
"calib/gap": 0.18491733125850995,
"calib/mean_conf": 0.8580468750000001,
"calib/mu_c": 0.9281132075471697,
"calib/mu_w": 0.7431958762886598,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24519531249999998,
"calib/std_conf": 0.24048635159346232,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7884272997032641,
"calib/step_q_c_n": 337.0,
"calib/step_q_gap": 0.15117611486914084,
"calib/step_q_w": 0.6372511848341232,
"calib/step_q_w_n": 211.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1004.0,
"completions/max_terminated_length": 1004.0,
"completions/mean_length": 392.52734375,
"completions/mean_terminated_length": 394.0666809082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.006647142581641674,
"kl": 0.06627655029296875,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0162,
"num_tokens": 44119711.0,
"reward": 0.41717860102653503,
"reward_std": 0.13773755729198456,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7377187609672546,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.22758033871650696,
"step": 196
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7500314228255405,
"calib/avg_num_step_conf": 1.95703125,
"calib/ece": 0.24754940711462442,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5849802371541502,
"calib/gap": 0.2424478381096029,
"calib/mean_conf": 0.762806324110672,
"calib/mu_c": 0.8749264705882354,
"calib/mu_w": 0.6324786324786325,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23640316205533587,
"calib/std_conf": 0.31291885769558153,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6909057971014493,
"calib/step_q_c_n": 276.0,
"calib/step_q_gap": 0.08872801932367147,
"calib/step_q_w": 0.6021777777777778,
"calib/step_q_w_n": 225.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 484.69140625,
"completions/mean_terminated_length": 484.69140625,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.006134700495749712,
"kl": 0.0559844970703125,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0136,
"num_tokens": 44348848.0,
"reward": 0.4062767028808594,
"reward_std": 0.21886281669139862,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7109090089797974,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.20148061215877533,
"step": 197
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7699800531914893,
"calib/avg_num_step_conf": 2.16796875,
"calib/ece": 0.16625984251968506,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5511811023622047,
"calib/gap": 0.29319015957446815,
"calib/mean_conf": 0.7625590551181102,
"calib/mu_c": 0.8710625000000001,
"calib/mu_w": 0.5778723404255319,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1494488188976378,
"calib/std_conf": 0.30623852454266404,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7793093093093093,
"calib/step_q_c_n": 333.0,
"calib/step_q_gap": 0.20872372372372383,
"calib/step_q_w": 0.5705855855855855,
"calib/step_q_w_n": 222.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2011.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 427.7421875,
"completions/mean_terminated_length": 427.7421875,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.2112,
"grad_norm": 0.006533287465572357,
"kl": 0.07459259033203125,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0232,
"num_tokens": 44563734.0,
"reward": 0.47105446457862854,
"reward_std": 0.19906282424926758,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7860128879547119,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.16734150052070618,
"step": 198
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7142857142857143,
"calib/avg_num_step_conf": 2.27734375,
"calib/ece": 0.234313725490196,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5647058823529412,
"calib/gap": 0.16734521575984984,
"calib/mean_conf": 0.7745490196078432,
"calib/mu_c": 0.8342682926829268,
"calib/mu_w": 0.666923076923077,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18286274509803915,
"calib/std_conf": 0.3003091422691147,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6988650306748466,
"calib/step_q_c_n": 326.0,
"calib/step_q_gap": 0.1422891551884652,
"calib/step_q_w": 0.5565758754863814,
"calib/step_q_w_n": 257.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1633.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 481.91796875,
"completions/mean_terminated_length": 483.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.006298987660557032,
"kl": 0.057342529296875,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0096,
"num_tokens": 44791305.0,
"reward": 0.42878156900405884,
"reward_std": 0.20725329220294952,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7290300130844116,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.19646695256233215,
"step": 199
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8179012345679012,
"calib/avg_num_step_conf": 2.08203125,
"calib/ece": 0.16921568627450984,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6078431372549019,
"calib/gap": 0.3307765830346474,
"calib/mean_conf": 0.8012156862745098,
"calib/mu_c": 0.9218518518518517,
"calib/mu_w": 0.5910752688172043,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16756862745098045,
"calib/std_conf": 0.28398096551274105,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7976415094339622,
"calib/step_q_c_n": 318.0,
"calib/step_q_gap": 0.25587406757349695,
"calib/step_q_w": 0.5417674418604652,
"calib/step_q_w_n": 215.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1656.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 468.3828125,
"completions/mean_terminated_length": 470.2196350097656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.006423901300877333,
"kl": 0.0532379150390625,
"learning_rate": 0.0,
"loss": 0.0273,
"num_tokens": 45019259.0,
"reward": 0.4917670786380768,
"reward_std": 0.1764676719903946,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.8102308511734009,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.15247796475887299,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.011295410779421217,
"train_runtime": 10938.5614,
"train_samples_per_second": 4.681,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 45019259,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}