Files
PureRL-1.5B-v7-s2-l2-kl-w1-b1/trainer_state.json
ModelHub XC 44c7fc73a7 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w1-b1
Source: Original Platform
2026-06-04 00:26:57 +08:00

11843 lines
475 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7490277290344238,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343300461769104,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04266543686389923,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.078,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.7281402349472046,
"reward_std": 0.16804265975952148,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7698483467102051,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345317482948303,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.03963975980877876,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.6806339025497437,
"reward_std": 0.16487614810466766,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7796330451965332,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7743935585021973,
"adv/std_final_conf": 0.9288880825042725,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.932464599609375,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4512251048218029,
"calib/avg_num_step_conf": 4.90625,
"calib/ece": 0.25462745098039213,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.30196078431372547,
"calib/gap": -0.006965408805031492,
"calib/mean_conf": 0.8781568627450981,
"calib/mu_c": 0.8755345911949685,
"calib/mu_w": 0.8825,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25462745098039213,
"calib/std_conf": 0.05181542375757666,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7867372881355932,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": 0.024255536310775594,
"calib/step_q_w": 0.7624817518248176,
"calib/step_q_w_n": 548.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1080.0,
"completions/max_terminated_length": 1080.0,
"completions/mean_length": 490.4765625,
"completions/mean_terminated_length": 492.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.05618441477417946,
"kl": 0.0011872649192810059,
"learning_rate": 7.5e-07,
"loss": -0.0191,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03281779587268829,
"mask/share_reasoning": 0.8549892902374268,
"mask/share_step_conf": 0.10828666388988495,
"num_tokens": 689479.0,
"reward": 0.7213048934936523,
"reward_std": 0.13984259963035583,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6905413866043091,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7520683407783508,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7581098079681396,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7545291185379028,
"adv/std_final_conf": 0.9290906190872192,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343487620353699,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4615896358543418,
"calib/avg_num_step_conf": 4.90234375,
"calib/ece": 0.20826771653543305,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2637795275590551,
"calib/gap": -0.0029355742296919285,
"calib/mean_conf": 0.8775590551181104,
"calib/mu_c": 0.8765882352941176,
"calib/mu_w": 0.8795238095238095,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20826771653543305,
"calib/std_conf": 0.048783896520646235,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.793607594936709,
"calib/step_q_c_n": 790.0,
"calib/step_q_gap": 0.006854906764665936,
"calib/step_q_w": 0.786752688172043,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 501.4609375,
"completions/mean_terminated_length": 503.427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.04018299654126167,
"kl": 0.000284343957901001,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0582,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03364722430706024,
"mask/share_reasoning": 0.849086344242096,
"mask/share_step_conf": 0.11336017400026321,
"num_tokens": 924021.0,
"reward": 0.7345324754714966,
"reward_std": 0.13953471183776855,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7220828533172607,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7469820976257324,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7413392066955566,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7512904405593872,
"adv/std_final_conf": 0.9298239946365356,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348950982093811,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4331679894179894,
"calib/avg_num_step_conf": 4.62109375,
"calib/ece": 0.33457489878542507,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3157894736842105,
"calib/gap": -0.00674735449735453,
"calib/mean_conf": 0.8811336032388664,
"calib/mu_c": 0.878074074074074,
"calib/mu_w": 0.8848214285714285,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.33457489878542507,
"calib/std_conf": 0.04259224764063817,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7943534482758621,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.0008832224031720681,
"calib/step_q_w": 0.79347022587269,
"calib/step_q_w_n": 487.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2593.0,
"completions/max_terminated_length": 2593.0,
"completions/mean_length": 513.0,
"completions/mean_terminated_length": 515.0117797851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.03751188516616821,
"kl": 0.0002925395965576172,
"learning_rate": 1.25e-06,
"loss": -0.0334,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.034141309559345245,
"mask/share_reasoning": 0.852514386177063,
"mask/share_step_conf": 0.10943801701068878,
"num_tokens": 1162037.0,
"reward": 0.6394835710525513,
"reward_std": 0.17741966247558594,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6105879545211792,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.6683791875839233,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7694669961929321,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7502487897872925,
"adv/std_final_conf": 0.9307279586791992,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933992326259613,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5237428348097969,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.2790513833992096,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2608695652173913,
"calib/gap": 0.0010552371026577578,
"calib/mean_conf": 0.8798418972332016,
"calib/mu_c": 0.880263157894737,
"calib/mu_w": 0.8792079207920792,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2790513833992096,
"calib/std_conf": 0.041168393988522095,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7931697054698457,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": -0.009380753245750606,
"calib/step_q_w": 0.8025504587155963,
"calib/step_q_w_n": 545.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2863.0,
"completions/max_terminated_length": 2863.0,
"completions/mean_length": 449.0234375,
"completions/mean_terminated_length": 450.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.0064,
"grad_norm": 0.05391272157430649,
"kl": 0.0003655552864074707,
"learning_rate": 1.5e-06,
"loss": -0.0082,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03775656595826149,
"mask/share_reasoning": 0.8353100419044495,
"mask/share_step_conf": 0.12302714586257935,
"num_tokens": 1382939.0,
"reward": 0.689347505569458,
"reward_std": 0.153395414352417,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6682343482971191,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7104606032371521,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7352026700973511,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7448313236236572,
"adv/std_final_conf": 0.9303086996078491,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341685175895691,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.49784187448616063,
"calib/avg_num_step_conf": 5.39453125,
"calib/ece": 0.24169960474308289,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.30039525691699603,
"calib/gap": -0.0010311044121676938,
"calib/mean_conf": 0.8822529644268774,
"calib/mu_c": 0.881890243902439,
"calib/mu_w": 0.8829213483146067,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23786561264822126,
"calib/std_conf": 0.04291805825539821,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7909617486338798,
"calib/step_q_c_n": 915.0,
"calib/step_q_gap": 0.005403808719716685,
"calib/step_q_w": 0.7855579399141631,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2242.0,
"completions/max_terminated_length": 2242.0,
"completions/mean_length": 525.84375,
"completions/mean_terminated_length": 527.9058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.060856908559799194,
"kl": 0.00033777952194213867,
"learning_rate": 1.75e-06,
"loss": -0.0041,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030560888350009918,
"mask/share_reasoning": 0.8551996946334839,
"mask/share_step_conf": 0.11033320426940918,
"num_tokens": 1624979.0,
"reward": 0.7218343019485474,
"reward_std": 0.17086157202720642,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7019370794296265,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7417315244674683,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7708044648170471,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7599292993545532,
"adv/std_final_conf": 0.9317449331283569,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343979954719543,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4968477531857814,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.29518218623481773,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.25101214574898784,
"calib/gap": 0.013062374245472741,
"calib/mean_conf": 0.870080971659919,
"calib/mu_c": 0.8756338028169014,
"calib/mu_w": 0.8625714285714287,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29518218623481773,
"calib/std_conf": 0.07809986290186875,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7903883495145632,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.026240084530335994,
"calib/step_q_w": 0.7641482649842272,
"calib/step_q_w_n": 634.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2577.0,
"completions/max_terminated_length": 2577.0,
"completions/mean_length": 536.0078125,
"completions/mean_terminated_length": 538.10986328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.043951455503702164,
"kl": 0.0003973245620727539,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0311,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03252224624156952,
"mask/share_reasoning": 0.860336184501648,
"mask/share_step_conf": 0.10323531180620193,
"num_tokens": 1868709.0,
"reward": 0.6831406354904175,
"reward_std": 0.1717548966407776,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6414449214935303,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7248363494873047,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7700271606445312,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7614004611968994,
"adv/std_final_conf": 0.9310073852539062,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345640540122986,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4621662701559609,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.2667588932806325,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": -0.006087761036214778,
"calib/mean_conf": 0.8833596837944664,
"calib/mu_c": 0.881025641025641,
"calib/mu_w": 0.8871134020618557,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2667588932806325,
"calib/std_conf": 0.04391170933248677,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7720178799489145,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.023901375094545596,
"calib/step_q_w": 0.7481165048543689,
"calib/step_q_w_n": 515.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2465.0,
"completions/max_terminated_length": 2465.0,
"completions/mean_length": 489.2734375,
"completions/mean_terminated_length": 493.1259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.0096,
"grad_norm": 0.03829352185130119,
"kl": 0.00040724873542785645,
"learning_rate": 2.25e-06,
"loss": -0.02,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03479592502117157,
"mask/share_reasoning": 0.8491111993789673,
"mask/share_step_conf": 0.10828033089637756,
"num_tokens": 2101499.0,
"reward": 0.6831810474395752,
"reward_std": 0.17485444247722626,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6700422167778015,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.6963199377059937,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7428078055381775,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7591912150382996,
"adv/std_final_conf": 0.9304345846176147,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342936873435974,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5017532467532467,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.27909448818897636,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3700787401574803,
"calib/gap": 0.017472727272727084,
"calib/mean_conf": 0.8853937007874015,
"calib/mu_c": 0.8922727272727271,
"calib/mu_w": 0.8748,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27909448818897636,
"calib/std_conf": 0.08844034935136104,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7868376536215461,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.0044474954493142205,
"calib/step_q_w": 0.7823901581722319,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 522.12890625,
"completions/mean_terminated_length": 522.12890625,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.03278151899576187,
"kl": 0.0004666447639465332,
"learning_rate": 2.5e-06,
"loss": 0.0261,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03185880556702614,
"mask/share_reasoning": 0.8579164743423462,
"mask/share_step_conf": 0.11022467166185379,
"num_tokens": 2341964.0,
"reward": 0.7027997970581055,
"reward_std": 0.162948340177536,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6785824298858643,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7270171642303467,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7659250497817993,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7720210552215576,
"adv/std_final_conf": 0.9277737736701965,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9336807727813721,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47283041401273884,
"calib/avg_num_step_conf": 5.43359375,
"calib/ece": 0.2828853754940712,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.35177865612648224,
"calib/gap": -0.019252255838641208,
"calib/mean_conf": 0.8794071146245058,
"calib/mu_c": 0.8721019108280256,
"calib/mu_w": 0.8913541666666668,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.27086956521739136,
"calib/std_conf": 0.09333544982717665,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7728014616321559,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.0013628651409278714,
"calib/step_q_w": 0.771438596491228,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 536.41796875,
"completions/mean_terminated_length": 536.41796875,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.04829133301973343,
"kl": 0.0008193850517272949,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0151,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0325528159737587,
"mask/share_reasoning": 0.8514052629470825,
"mask/share_step_conf": 0.11604189872741699,
"num_tokens": 2583767.0,
"reward": 0.7033008337020874,
"reward_std": 0.1320822387933731,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6716292500495911,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7349722981452942,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7572861909866333,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7546399831771851,
"adv/std_final_conf": 0.9274834394454956,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344656467437744,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5159368168676398,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.2420799999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.472,
"calib/gap": 0.004847330935759175,
"calib/mean_conf": 0.89408,
"calib/mu_c": 0.8957668711656441,
"calib/mu_w": 0.890919540229885,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2420799999999999,
"calib/std_conf": 0.05309005179880691,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7620438799076213,
"calib/step_q_c_n": 866.0,
"calib/step_q_gap": -0.0006497227859814103,
"calib/step_q_w": 0.7626936026936028,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2856.0,
"completions/max_terminated_length": 2856.0,
"completions/mean_length": 484.40625,
"completions/mean_terminated_length": 488.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0128,
"grad_norm": 0.04951918497681618,
"kl": 0.0016373395919799805,
"learning_rate": 3e-06,
"loss": 0.001,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0348307266831398,
"mask/share_reasoning": 0.8321601152420044,
"mask/share_step_conf": 0.12519662082195282,
"num_tokens": 2811951.0,
"reward": 0.72372967004776,
"reward_std": 0.17495250701904297,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6914949417114258,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7559643983840942,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7482670545578003,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.756947934627533,
"adv/std_final_conf": 0.9266350865364075,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343085289001465,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5362989801395598,
"calib/avg_num_step_conf": 4.8125,
"calib/ece": 0.26543307086614176,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5118110236220472,
"calib/gap": 0.0064251207729467685,
"calib/mean_conf": 0.9032283464566928,
"calib/mu_c": 0.9055555555555554,
"calib/mu_w": 0.8991304347826087,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26543307086614176,
"calib/std_conf": 0.04485114294251648,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7694840000000001,
"calib/step_q_c_n": 750.0,
"calib/step_q_gap": 0.018488149377593466,
"calib/step_q_w": 0.7509958506224066,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 486.62109375,
"completions/mean_terminated_length": 486.62109375,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.027871543541550636,
"kl": 0.0021517276763916016,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0376,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03444457799196243,
"mask/share_reasoning": 0.8526432514190674,
"mask/share_step_conf": 0.1129121482372284,
"num_tokens": 3041118.0,
"reward": 0.733703076839447,
"reward_std": 0.151658296585083,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6940249800682068,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7733811140060425,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7391790151596069,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7640714645385742,
"adv/std_final_conf": 0.9275602102279663,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341883063316345,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.38958755916159565,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.33461538461538465,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6963562753036437,
"calib/gap": -0.012419878296146103,
"calib/mean_conf": 0.9194736842105263,
"calib/mu_c": 0.9143448275862068,
"calib/mu_w": 0.9267647058823529,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.33352226720647776,
"calib/std_conf": 0.035939196230548164,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7463983628922237,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.03850947400333482,
"calib/step_q_w": 0.7078888888888889,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2752.0,
"completions/max_terminated_length": 2752.0,
"completions/mean_length": 554.06640625,
"completions/mean_terminated_length": 556.2392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.036938756704330444,
"kl": 0.00412750244140625,
"learning_rate": 3.5e-06,
"loss": 0.0002,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03238217532634735,
"mask/share_reasoning": 0.849589467048645,
"mask/share_step_conf": 0.11412206292152405,
"num_tokens": 3288359.0,
"reward": 0.6798146963119507,
"reward_std": 0.16921201348304749,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6168820858001709,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.74274742603302,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7596204280853271,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.75995934009552,
"adv/std_final_conf": 0.9207594394683838,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343336224555969,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.49663684243924106,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.33113281249999993,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.76953125,
"calib/gap": 0.0013852401802143532,
"calib/mean_conf": 0.9287890624999999,
"calib/mu_c": 0.9293464052287581,
"calib/mu_w": 0.9279611650485438,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33113281249999993,
"calib/std_conf": 0.04057807912372261,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7134652114597545,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": -0.007710125727143469,
"calib/step_q_w": 0.721175337186898,
"calib/step_q_w_n": 519.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1282.0,
"completions/max_terminated_length": 1282.0,
"completions/mean_length": 466.4375,
"completions/mean_terminated_length": 468.2666931152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.016,
"grad_norm": 0.035972610116004944,
"kl": 0.008241653442382812,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0063,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034142546355724335,
"mask/share_reasoning": 0.8487222790718079,
"mask/share_step_conf": 0.11322891712188721,
"num_tokens": 3515647.0,
"reward": 0.7133145332336426,
"reward_std": 0.1503565013408661,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6489074230194092,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.7777215242385864,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7493153214454651,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7484169006347656,
"adv/std_final_conf": 0.9216910600662231,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349498152732849,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5604395604395604,
"calib/avg_num_step_conf": 5.953125,
"calib/ece": 0.35545816733067703,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8764940239043825,
"calib/gap": 0.010113814756671835,
"calib/mean_conf": 0.9411155378486055,
"calib/mu_c": 0.9453061224489795,
"calib/mu_w": 0.9351923076923077,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.35545816733067703,
"calib/std_conf": 0.0362202013476857,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6621888888888889,
"calib/step_q_c_n": 900.0,
"calib/step_q_gap": 0.011259401709401762,
"calib/step_q_w": 0.6509294871794872,
"calib/step_q_w_n": 624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 635.55078125,
"completions/mean_terminated_length": 640.5551147460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.02724113129079342,
"kl": 0.009087562561035156,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0956,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.025532642379403114,
"mask/share_reasoning": 0.8610584735870361,
"mask/share_step_conf": 0.10559634864330292,
"num_tokens": 3787196.0,
"reward": 0.6936075687408447,
"reward_std": 0.16829201579093933,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6143804788589478,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7728347182273865,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7701978087425232,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7553335428237915,
"adv/std_final_conf": 0.9151206612586975,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934809684753418,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5359654731457801,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.21357142857142852,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9246031746031746,
"calib/gap": 0.004705882352941337,
"calib/mean_conf": 0.9437301587301588,
"calib/mu_c": 0.9450000000000002,
"calib/mu_w": 0.9402941176470588,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21357142857142852,
"calib/std_conf": 0.026359364077647593,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6380529300567107,
"calib/step_q_c_n": 1058.0,
"calib/step_q_gap": -0.004592572588791977,
"calib/step_q_w": 0.6426455026455027,
"calib/step_q_w_n": 378.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2565.0,
"completions/max_terminated_length": 2565.0,
"completions/mean_length": 534.19921875,
"completions/mean_terminated_length": 536.2941284179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.0346217043697834,
"kl": 0.013601303100585938,
"learning_rate": 4.25e-06,
"loss": 0.02,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03202342242002487,
"mask/share_reasoning": 0.8436535000801086,
"mask/share_step_conf": 0.1204168051481247,
"num_tokens": 4027479.0,
"reward": 0.7808581590652466,
"reward_std": 0.17788267135620117,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7462132573127747,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8155031204223633,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7502391934394836,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7626986503601074,
"adv/std_final_conf": 0.9088346362113953,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348260164260864,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.57067383739323,
"calib/avg_num_step_conf": 4.85546875,
"calib/ece": 0.38055118110236213,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9566929133858267,
"calib/gap": 0.007304017715912603,
"calib/mean_conf": 0.9514173228346455,
"calib/mu_c": 0.954551724137931,
"calib/mu_w": 0.9472477064220184,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.38055118110236213,
"calib/std_conf": 0.031050489984311938,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6232016348773842,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": -0.042810152941869184,
"calib/step_q_w": 0.6660117878192534,
"calib/step_q_w_n": 509.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1549.0,
"completions/max_terminated_length": 1549.0,
"completions/mean_length": 504.359375,
"completions/mean_terminated_length": 508.3307189941406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.0192,
"grad_norm": 0.029888764023780823,
"kl": 0.015348434448242188,
"learning_rate": 4.5e-06,
"loss": -0.0963,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03175097703933716,
"mask/share_reasoning": 0.8558838367462158,
"mask/share_step_conf": 0.10455270856618881,
"num_tokens": 4267315.0,
"reward": 0.6815993785858154,
"reward_std": 0.15395517647266388,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6041250228881836,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.759073793888092,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7441777586936951,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7759392857551575,
"adv/std_final_conf": 0.8993977904319763,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340354800224304,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4692715231788079,
"calib/avg_num_step_conf": 4.68359375,
"calib/ece": 0.35298804780876497,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9760956175298805,
"calib/gap": 0.013600000000000168,
"calib/mean_conf": 0.954581673306773,
"calib/mu_c": 0.9600000000000001,
"calib/mu_w": 0.9463999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.35298804780876497,
"calib/std_conf": 0.07735238031133577,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6428904109589041,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.009777404562315684,
"calib/step_q_w": 0.6331130063965884,
"calib/step_q_w_n": 469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1887.0,
"completions/max_terminated_length": 1887.0,
"completions/mean_length": 487.98828125,
"completions/mean_terminated_length": 489.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.020266666666666665,
"grad_norm": 10893.642578125,
"kl": 10240.019634246826,
"learning_rate": 4.75e-06,
"loss": 534.4695,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031817466020584106,
"mask/share_reasoning": 0.8552243709564209,
"mask/share_step_conf": 0.10905186831951141,
"num_tokens": 4497000.0,
"reward": 0.70301353931427,
"reward_std": 0.1717141568660736,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6160424947738647,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7899844646453857,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7263522744178772,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.754086971282959,
"adv/std_final_conf": 0.891075074672699,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345456957817078,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.47034706331045006,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.416984126984127,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9722222222222222,
"calib/gap": 0.000877192982455921,
"calib/mean_conf": 0.9646031746031747,
"calib/mu_c": 0.9649999999999999,
"calib/mu_w": 0.9641228070175439,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.416984126984127,
"calib/std_conf": 0.0233554586818447,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6103757225433526,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.030154115895153155,
"calib/step_q_w": 0.5802216066481994,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2558.0,
"completions/max_terminated_length": 2558.0,
"completions/mean_length": 492.3515625,
"completions/mean_terminated_length": 496.22833251953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.029597941786050797,
"kl": 0.023279190063476562,
"learning_rate": 5e-06,
"loss": -0.0616,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03490706533193588,
"mask/share_reasoning": 0.8316913843154907,
"mask/share_step_conf": 0.1255890280008316,
"num_tokens": 4727914.0,
"reward": 0.673703134059906,
"reward_std": 0.13693217933177948,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5692453384399414,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7781610488891602,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.6890993118286133,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7478216886520386,
"adv/std_final_conf": 0.8967701196670532,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352716207504272,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5542207792207793,
"calib/avg_num_step_conf": 5.76953125,
"calib/ece": 0.3635826771653544,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9960629921259843,
"calib/gap": 0.0032688311688313654,
"calib/mean_conf": 0.9698818897637796,
"calib/mu_c": 0.9711688311688312,
"calib/mu_w": 0.9678999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3635826771653544,
"calib/std_conf": 0.017534718843846057,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5828265765765765,
"calib/step_q_c_n": 888.0,
"calib/step_q_gap": -0.012793117820707023,
"calib/step_q_w": 0.5956196943972836,
"calib/step_q_w_n": 589.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1292.0,
"completions/max_terminated_length": 1292.0,
"completions/mean_length": 486.63671875,
"completions/mean_terminated_length": 490.468505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.0224,
"grad_norm": 0.01837567612528801,
"kl": 0.026103973388671875,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.026,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03301333636045456,
"mask/share_reasoning": 0.8315001726150513,
"mask/share_step_conf": 0.1276739537715912,
"num_tokens": 4955453.0,
"reward": 0.7198125720024109,
"reward_std": 0.18437299132347107,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6252046823501587,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8144204616546631,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.6960532665252686,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7869043946266174,
"adv/std_final_conf": 0.8804850578308105,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348463416099548,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47161892071952033,
"calib/avg_num_step_conf": 5.83984375,
"calib/ece": 0.34031620553359687,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9881422924901185,
"calib/gap": 0.016654896735510016,
"calib/mean_conf": 0.9648221343873518,
"calib/mu_c": 0.9710759493670886,
"calib/mu_w": 0.9544210526315786,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.34031620553359687,
"calib/std_conf": 0.07940283534646002,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6132855567805954,
"calib/step_q_c_n": 907.0,
"calib/step_q_gap": 0.03634678127039126,
"calib/step_q_w": 0.5769387755102041,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 484.32421875,
"completions/mean_terminated_length": 488.1377868652344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.023580927401781082,
"kl": 0.038417816162109375,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0517,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032605595886707306,
"mask/share_reasoning": 0.8292992115020752,
"mask/share_step_conf": 0.1302826702594757,
"num_tokens": 5181256.0,
"reward": 0.724952757358551,
"reward_std": 0.16283224523067474,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6435617208480835,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8063437938690186,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7656569480895996,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7499951124191284,
"adv/std_final_conf": 0.9161418080329895,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356306195259094,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5094650471637162,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.40884462151394435,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9840637450199203,
"calib/gap": -0.005951673342809216,
"calib/mean_conf": 0.9670916334661355,
"calib/mu_c": 0.964507042253521,
"calib/mu_w": 0.9704587155963302,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4050996015936256,
"calib/std_conf": 0.06065304938847371,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6222773536895674,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.0031504423745827292,
"calib/step_q_w": 0.6191269113149847,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 549.26171875,
"completions/mean_terminated_length": 551.4157104492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.022318795323371887,
"kl": 0.03360748291015625,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0417,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03272176906466484,
"mask/share_reasoning": 0.838639497756958,
"mask/share_step_conf": 0.12473248690366745,
"num_tokens": 5425803.0,
"reward": 0.661957859992981,
"reward_std": 0.21865949034690857,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5673785209655762,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7565371990203857,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7508785724639893,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7865563631057739,
"adv/std_final_conf": 0.9145070910453796,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9358435273170471,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5549862041781632,
"calib/avg_num_step_conf": 6.13671875,
"calib/ece": 0.45259109311740886,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": 0.004272106162133804,
"calib/mean_conf": 0.9748582995951417,
"calib/mu_c": 0.9768992248062016,
"calib/mu_w": 0.9726271186440678,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.45259109311740886,
"calib/std_conf": 0.01714076520831078,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6104135876042908,
"calib/step_q_c_n": 839.0,
"calib/step_q_gap": 0.013647603997733349,
"calib/step_q_w": 0.5967659836065574,
"calib/step_q_w_n": 732.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 611.875,
"completions/mean_terminated_length": 611.875,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.0256,
"grad_norm": 0.02459198795258999,
"kl": 0.032527923583984375,
"learning_rate": 4.888888888888889e-06,
"loss": -0.1041,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.029947519302368164,
"mask/share_reasoning": 0.8479431867599487,
"mask/share_step_conf": 0.1221093088388443,
"num_tokens": 5686955.0,
"reward": 0.63507080078125,
"reward_std": 0.24100688099861145,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5282472968101501,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7418943643569946,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7561845779418945,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7850910425186157,
"adv/std_final_conf": 0.8727063536643982,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347834587097168,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.44064102564102564,
"calib/avg_num_step_conf": 5.9375,
"calib/ece": 0.385236220472441,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0030628205128204744,
"calib/mean_conf": 0.9757874015748031,
"calib/mu_c": 0.9745333333333334,
"calib/mu_w": 0.9775961538461538,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.385236220472441,
"calib/std_conf": 0.015521450699946315,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6261529808773904,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": 0.03292001732746963,
"calib/step_q_w": 0.5932329635499207,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2777.0,
"completions/max_terminated_length": 2777.0,
"completions/mean_length": 492.82421875,
"completions/mean_terminated_length": 494.75689697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.020179076120257378,
"kl": 0.038753509521484375,
"learning_rate": 4.861111111111111e-06,
"loss": -0.043,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032434768974781036,
"mask/share_reasoning": 0.8340907096862793,
"mask/share_step_conf": 0.1295682191848755,
"num_tokens": 5916342.0,
"reward": 0.6886754035949707,
"reward_std": 0.14827287197113037,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6033198833465576,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7740308046340942,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7001343965530396,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7647371292114258,
"adv/std_final_conf": 0.8727540373802185,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355230331420898,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5272166105499438,
"calib/avg_num_step_conf": 5.83203125,
"calib/ece": 0.32838000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.996,
"calib/gap": 0.0009238215488213131,
"calib/mean_conf": 0.9748600000000002,
"calib/mu_c": 0.9751851851851852,
"calib/mu_w": 0.9742613636363638,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3276200000000001,
"calib/std_conf": 0.018397836829366656,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6341238317757009,
"calib/step_q_c_n": 856.0,
"calib/step_q_gap": 0.02394015830631313,
"calib/step_q_w": 0.6101836734693877,
"calib/step_q_w_n": 637.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 494.59375,
"completions/mean_terminated_length": 500.4585266113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.024296032264828682,
"kl": 0.043674468994140625,
"learning_rate": 4.833333333333333e-06,
"loss": -0.014,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.029950639232993126,
"mask/share_reasoning": 0.8423521518707275,
"mask/share_step_conf": 0.11597850918769836,
"num_tokens": 6148198.0,
"reward": 0.7104045152664185,
"reward_std": 0.16006594896316528,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6456553936004639,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.775153636932373,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7514786720275879,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7520699501037598,
"adv/std_final_conf": 0.9026116728782654,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352880716323853,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.49590769230769227,
"calib/avg_num_step_conf": 6.41796875,
"calib/ece": 0.4653333333333335,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.984313725490196,
"calib/gap": 0.0012892307692307092,
"calib/mean_conf": 0.9751372549019608,
"calib/mu_c": 0.9757692307692307,
"calib/mu_w": 0.97448,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4653333333333335,
"calib/std_conf": 0.022732044936042178,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6068387096774194,
"calib/step_q_c_n": 775.0,
"calib/step_q_gap": 0.0008364055299538986,
"calib/step_q_w": 0.6060023041474655,
"calib/step_q_w_n": 868.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2313.0,
"completions/max_terminated_length": 2313.0,
"completions/mean_length": 516.35546875,
"completions/mean_terminated_length": 516.35546875,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.0288,
"grad_norm": 0.021031202748417854,
"kl": 0.04630279541015625,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0115,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03199648857116699,
"mask/share_reasoning": 0.8362272381782532,
"mask/share_step_conf": 0.13177627325057983,
"num_tokens": 6385601.0,
"reward": 0.6513494253158569,
"reward_std": 0.18348070979118347,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5315262079238892,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7711726427078247,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7151632308959961,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.786544144153595,
"adv/std_final_conf": 0.8922801613807678,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356615543365479,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5129892037786774,
"calib/avg_num_step_conf": 5.5859375,
"calib/ece": 0.3602310756972111,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": -0.005878340080971323,
"calib/mean_conf": 0.9730517928286853,
"calib/mu_c": 0.9708269230769232,
"calib/mu_w": 0.9767052631578945,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.35588446215139435,
"calib/std_conf": 0.06291309147116418,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6100180995475114,
"calib/step_q_c_n": 884.0,
"calib/step_q_gap": 0.0076701141995260125,
"calib/step_q_w": 0.6023479853479854,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 562.87890625,
"completions/mean_terminated_length": 565.0863037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.023689938709139824,
"kl": 0.045177459716796875,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0802,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030054759234189987,
"mask/share_reasoning": 0.8516813516616821,
"mask/share_step_conf": 0.11435763537883759,
"num_tokens": 6636642.0,
"reward": 0.6863582134246826,
"reward_std": 0.20521211624145508,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6138904094696045,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.758825957775116,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7489525079727173,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7721630334854126,
"adv/std_final_conf": 0.9078423380851746,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354410171508789,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5187202604557977,
"calib/avg_num_step_conf": 6.43359375,
"calib/ece": 0.4936363636363637,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": 0.01489669421487616,
"calib/mean_conf": 0.971897233201581,
"calib/mu_c": 0.9796694214876035,
"calib/mu_w": 0.9647727272727273,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4936363636363637,
"calib/std_conf": 0.08247383890068877,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6070542635658915,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": 0.023874423932443656,
"calib/step_q_w": 0.5831798396334479,
"calib/step_q_w_n": 873.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2349.0,
"completions/max_terminated_length": 2349.0,
"completions/mean_length": 585.59375,
"completions/mean_terminated_length": 587.8902587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.024896398186683655,
"kl": 0.0430755615234375,
"learning_rate": 4.75e-06,
"loss": -0.0228,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.027839384973049164,
"mask/share_reasoning": 0.8492968082427979,
"mask/share_step_conf": 0.11895756423473358,
"num_tokens": 6893682.0,
"reward": 0.6369415521621704,
"reward_std": 0.20564672350883484,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.5014816522598267,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7724014520645142,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7562463283538818,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.750690221786499,
"adv/std_final_conf": 0.8935472965240479,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354106783866882,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5064412238325282,
"calib/avg_num_step_conf": 6.1328125,
"calib/ece": 0.4310799999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.984,
"calib/gap": 0.015848631239935407,
"calib/mean_conf": 0.97108,
"calib/mu_c": 0.9783703703703702,
"calib/mu_w": 0.9625217391304348,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4310799999999999,
"calib/std_conf": 0.08527621942839633,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5898925410872313,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.03692527536194257,
"calib/step_q_w": 0.5529672657252888,
"calib/step_q_w_n": 779.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2983.0,
"completions/max_terminated_length": 2983.0,
"completions/mean_length": 606.51171875,
"completions/mean_terminated_length": 606.51171875,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.032,
"grad_norm": 0.02309613674879074,
"kl": 0.0448760986328125,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0328,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.027759820222854614,
"mask/share_reasoning": 0.8556734323501587,
"mask/share_step_conf": 0.1165667176246643,
"num_tokens": 7155933.0,
"reward": 0.6598018407821655,
"reward_std": 0.19287419319152832,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5530972480773926,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7665064334869385,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7802824974060059,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7845057249069214,
"adv/std_final_conf": 0.9099698066711426,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353131055831909,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4822432962720733,
"calib/avg_num_step_conf": 7.1484375,
"calib/ece": 0.5296787148594378,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9799196787148594,
"calib/gap": 0.006041203400915407,
"calib/mean_conf": 0.9714457831325302,
"calib/mu_c": 0.9748181818181818,
"calib/mu_w": 0.9687769784172664,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.5296787148594378,
"calib/std_conf": 0.06643169992789887,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5463068493150685,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.040461394769613857,
"calib/step_q_w": 0.5058454545454546,
"calib/step_q_w_n": 1100.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2377.0,
"completions/max_terminated_length": 2377.0,
"completions/mean_length": 641.30859375,
"completions/mean_terminated_length": 643.8235473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.04329806938767433,
"kl": 0.044368743896484375,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0532,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.026912782341241837,
"mask/share_reasoning": 0.8497459888458252,
"mask/share_step_conf": 0.11943497508764267,
"num_tokens": 7426020.0,
"reward": 0.6124132871627808,
"reward_std": 0.18781393766403198,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.4585081934928894,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7663182616233826,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7666586637496948,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7693814039230347,
"adv/std_final_conf": 0.9167671799659729,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356940984725952,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47760025062656647,
"calib/avg_num_step_conf": 6.06640625,
"calib/ece": 0.5046996047430831,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": -0.003230576441102362,
"calib/mean_conf": 0.9770316205533598,
"calib/mu_c": 0.9753333333333335,
"calib/mu_w": 0.9785639097744359,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.503711462450593,
"calib/std_conf": 0.0259186738909823,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5566849710982659,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.044272659832296,
"calib/step_q_w": 0.5124123112659699,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 582.82421875,
"completions/mean_terminated_length": 582.82421875,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.03413167595863342,
"kl": 0.05112457275390625,
"learning_rate": 4.666666666666667e-06,
"loss": -0.058,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.028594808652997017,
"mask/share_reasoning": 0.854396641254425,
"mask/share_step_conf": 0.11700853705406189,
"num_tokens": 7681927.0,
"reward": 0.6248780488967896,
"reward_std": 0.19497355818748474,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.48585769534111023,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.763898491859436,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7173354029655457,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7580817341804504,
"adv/std_final_conf": 0.8851724863052368,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935470700263977,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4924653215636822,
"calib/avg_num_step_conf": 6.55859375,
"calib/ece": 0.458452380952381,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9880952380952381,
"calib/gap": 0.0009167717528373309,
"calib/mean_conf": 0.9743253968253969,
"calib/mu_c": 0.9747692307692308,
"calib/mu_w": 0.9738524590163935,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.458452380952381,
"calib/std_conf": 0.018255822887149574,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5375353418308226,
"calib/step_q_c_n": 863.0,
"calib/step_q_gap": -0.002783408169177326,
"calib/step_q_w": 0.54031875,
"calib/step_q_w_n": 816.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 574.7265625,
"completions/mean_terminated_length": 574.7265625,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0352,
"grad_norm": 0.01975160650908947,
"kl": 0.053974151611328125,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0549,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.028635134920477867,
"mask/share_reasoning": 0.8466184139251709,
"mask/share_step_conf": 0.12474644184112549,
"num_tokens": 7935929.0,
"reward": 0.655128538608551,
"reward_std": 0.17714199423789978,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5317574143409729,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7784996628761292,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7687841057777405,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7759872078895569,
"adv/std_final_conf": 0.9181791543960571,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351248145103455,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5015461315158399,
"calib/avg_num_step_conf": 6.55859375,
"calib/ece": 0.43280632411067205,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9723320158102767,
"calib/gap": -0.01255080146409182,
"calib/mean_conf": 0.966086956521739,
"calib/mu_c": 0.960431654676259,
"calib/mu_w": 0.9729824561403508,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4247430830039527,
"calib/std_conf": 0.07909133377861023,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5107519675925926,
"calib/step_q_c_n": 864.0,
"calib/step_q_gap": 0.02507221299136564,
"calib/step_q_w": 0.4856797546012269,
"calib/step_q_w_n": 815.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2563.0,
"completions/max_terminated_length": 2563.0,
"completions/mean_length": 515.11328125,
"completions/mean_terminated_length": 517.1333618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.020340269431471825,
"kl": 0.06337738037109375,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0421,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030964922159910202,
"mask/share_reasoning": 0.8273770809173584,
"mask/share_step_conf": 0.1377517580986023,
"num_tokens": 8172910.0,
"reward": 0.6818004250526428,
"reward_std": 0.1963166892528534,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5597124695777893,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8038883805274963,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7449852824211121,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.759742021560669,
"adv/std_final_conf": 0.8946696519851685,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355137348175049,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4529529529529529,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.4090627450980392,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": -0.0017976726726727232,
"calib/mean_conf": 0.9737686274509804,
"calib/mu_c": 0.972986111111111,
"calib/mu_w": 0.9747837837837837,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4090627450980392,
"calib/std_conf": 0.016829403426287484,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5247587064676617,
"calib/step_q_c_n": 804.0,
"calib/step_q_gap": 0.03468963739859254,
"calib/step_q_w": 0.4900690690690691,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2162.0,
"completions/max_terminated_length": 2162.0,
"completions/mean_length": 574.77734375,
"completions/mean_terminated_length": 574.77734375,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.019349532201886177,
"kl": 0.05916595458984375,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0447,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.02842956781387329,
"mask/share_reasoning": 0.8613673448562622,
"mask/share_step_conf": 0.11020311713218689,
"num_tokens": 8429309.0,
"reward": 0.6868324875831604,
"reward_std": 0.1873733252286911,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5831688046455383,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7904962301254272,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7299237847328186,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7725102305412292,
"adv/std_final_conf": 0.8623014092445374,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352320432662964,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4311440677966102,
"calib/avg_num_step_conf": 6.34765625,
"calib/ece": 0.20968525896414342,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": -0.004596133474576103,
"calib/mean_conf": 0.9723147410358566,
"calib/mu_c": 0.971234375,
"calib/mu_w": 0.9758305084745761,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20852988047808765,
"calib/std_conf": 0.019501775938964513,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4856729200652528,
"calib/step_q_c_n": 1226.0,
"calib/step_q_gap": 0.010750614300841788,
"calib/step_q_w": 0.474922305764411,
"calib/step_q_w_n": 399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2517.0,
"completions/max_terminated_length": 2517.0,
"completions/mean_length": 542.81640625,
"completions/mean_terminated_length": 542.81640625,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0384,
"grad_norm": 0.045901015400886536,
"kl": 0.062183380126953125,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0415,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03236595168709755,
"mask/share_reasoning": 0.8319715261459351,
"mask/share_step_conf": 0.1356624960899353,
"num_tokens": 8670982.0,
"reward": 0.7835397720336914,
"reward_std": 0.1716996133327484,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7599384784698486,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8071410655975342,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7526949048042297,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7663756608963013,
"adv/std_final_conf": 0.8950009346008301,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353793263435364,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5237055733678969,
"calib/avg_num_step_conf": 6.44140625,
"calib/ece": 0.5281967213114757,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9959016393442623,
"calib/gap": 0.00850262637287691,
"calib/mean_conf": 0.96672131147541,
"calib/mu_c": 0.9714953271028037,
"calib/mu_w": 0.9629927007299268,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5281967213114757,
"calib/std_conf": 0.06370967220686559,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5241205211726383,
"calib/step_q_c_n": 614.0,
"calib/step_q_gap": 0.08163356465089916,
"calib/step_q_w": 0.44248695652173914,
"calib/step_q_w_n": 1035.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2927.0,
"completions/max_terminated_length": 2927.0,
"completions/mean_length": 608.45703125,
"completions/mean_terminated_length": 608.45703125,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.030415819957852364,
"kl": 0.05530548095703125,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0411,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.028621872887015343,
"mask/share_reasoning": 0.8487331867218018,
"mask/share_step_conf": 0.12264493852853775,
"num_tokens": 8933843.0,
"reward": 0.6066591739654541,
"reward_std": 0.16862811148166656,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.4526539146900177,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7606644630432129,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.737849771976471,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7637723684310913,
"adv/std_final_conf": 0.9148930907249451,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354915022850037,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6269546027742748,
"calib/avg_num_step_conf": 6.27734375,
"calib/ece": 0.487936507936508,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9761904761904762,
"calib/gap": 0.006776796973518517,
"calib/mean_conf": 0.9641269841269843,
"calib/mu_c": 0.9676229508196723,
"calib/mu_w": 0.9608461538461538,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.48396825396825405,
"calib/std_conf": 0.08244545903990914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4998193548387097,
"calib/step_q_c_n": 775.0,
"calib/step_q_gap": 0.024038585607940477,
"calib/step_q_w": 0.47578076923076923,
"calib/step_q_w_n": 832.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2648.0,
"completions/max_terminated_length": 2648.0,
"completions/mean_length": 577.52734375,
"completions/mean_terminated_length": 577.52734375,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.02135089784860611,
"kl": 0.06496429443359375,
"learning_rate": 4.5e-06,
"loss": -0.0589,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02866470254957676,
"mask/share_reasoning": 0.849726676940918,
"mask/share_step_conf": 0.12160862237215042,
"num_tokens": 9188578.0,
"reward": 0.6399968862533569,
"reward_std": 0.1794516146183014,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.5083702802658081,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7716234922409058,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7724220752716064,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7736121416091919,
"adv/std_final_conf": 0.9094294309616089,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355300068855286,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4988117573483427,
"calib/avg_num_step_conf": 6.46875,
"calib/ece": 0.4747391304347827,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9802371541501976,
"calib/gap": 0.02197204502814265,
"calib/mean_conf": 0.960905138339921,
"calib/mu_c": 0.9721951219512196,
"calib/mu_w": 0.950223076923077,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4747391304347827,
"calib/std_conf": 0.0965838169105595,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5251319463087248,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.07652711645142518,
"calib/step_q_w": 0.44860482985729966,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 581.4140625,
"completions/mean_terminated_length": 581.4140625,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.0416,
"grad_norm": 0.02459811419248581,
"kl": 0.05318450927734375,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0369,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02947234734892845,
"mask/share_reasoning": 0.846280038356781,
"mask/share_step_conf": 0.12424758821725845,
"num_tokens": 9443508.0,
"reward": 0.6575571298599243,
"reward_std": 0.1858455240726471,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.5201391577720642,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7949751019477844,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7673207521438599,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7538126707077026,
"adv/std_final_conf": 0.911514163017273,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354920983314514,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5258962868117798,
"calib/avg_num_step_conf": 5.91015625,
"calib/ece": 0.523015873015873,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9880952380952381,
"calib/gap": 0.0213341869398207,
"calib/mean_conf": 0.9595238095238096,
"calib/mu_c": 0.9715454545454544,
"calib/mu_w": 0.9502112676056337,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.523015873015873,
"calib/std_conf": 0.10624438093718476,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5221390675241158,
"calib/step_q_c_n": 622.0,
"calib/step_q_gap": 0.035979696031410935,
"calib/step_q_w": 0.48615937149270483,
"calib/step_q_w_n": 891.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2511.0,
"completions/max_terminated_length": 2511.0,
"completions/mean_length": 621.14453125,
"completions/mean_terminated_length": 621.14453125,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.029524317011237144,
"kl": 0.06084442138671875,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0962,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.028228700160980225,
"mask/share_reasoning": 0.8570345640182495,
"mask/share_step_conf": 0.11473676562309265,
"num_tokens": 9709281.0,
"reward": 0.6209744215011597,
"reward_std": 0.1786309778690338,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.47219765186309814,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7697511315345764,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.6746046543121338,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7511892914772034,
"adv/std_final_conf": 0.8650860786437988,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935126781463623,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5606282552083334,
"calib/avg_num_step_conf": 5.78125,
"calib/ece": 0.2192226562499999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.99609375,
"calib/gap": 0.0058177083333335045,
"calib/mean_conf": 0.9692226562500001,
"calib/mu_c": 0.9706770833333334,
"calib/mu_w": 0.9648593749999999,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2192226562499999,
"calib/std_conf": 0.01716490658128777,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5236314847942756,
"calib/step_q_c_n": 1118.0,
"calib/step_q_gap": -0.01351768647644258,
"calib/step_q_w": 0.5371491712707182,
"calib/step_q_w_n": 362.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1476.0,
"completions/max_terminated_length": 1476.0,
"completions/mean_length": 501.53515625,
"completions/mean_terminated_length": 503.5019836425781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.03243149816989899,
"kl": 0.06836700439453125,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0395,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03247971460223198,
"mask/share_reasoning": 0.8331398963928223,
"mask/share_step_conf": 0.13047410547733307,
"num_tokens": 9944922.0,
"reward": 0.790048360824585,
"reward_std": 0.1570318639278412,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7657284736633301,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8143682479858398,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7103356122970581,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7571755647659302,
"adv/std_final_conf": 0.907038688659668,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935427188873291,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5013734466971878,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.4032128514056226,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.963855421686747,
"calib/gap": 0.014945716154349031,
"calib/mean_conf": 0.956706827309237,
"calib/mu_c": 0.9633093525179854,
"calib/mu_w": 0.9483636363636364,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.400843373493976,
"calib/std_conf": 0.08885934009920263,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49707305389221557,
"calib/step_q_c_n": 835.0,
"calib/step_q_gap": 0.02447208570411047,
"calib/step_q_w": 0.4726009681881051,
"calib/step_q_w_n": 723.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 512.11328125,
"completions/mean_terminated_length": 512.11328125,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.0448,
"grad_norm": 0.02139648236334324,
"kl": 0.06438064575195312,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0541,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03238638862967491,
"mask/share_reasoning": 0.8317169547080994,
"mask/share_step_conf": 0.13589666783809662,
"num_tokens": 10180391.0,
"reward": 0.6827822327613831,
"reward_std": 0.18697020411491394,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5778406262397766,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7877238988876343,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7501436471939087,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7426331043243408,
"adv/std_final_conf": 0.9032301306724548,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354005455970764,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5683174603174603,
"calib/avg_num_step_conf": 6.08984375,
"calib/ece": 0.4669322709163347,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.009430476190476322,
"calib/mean_conf": 0.9635059760956176,
"calib/mu_c": 0.96824,
"calib/mu_w": 0.9588095238095237,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.46621513944223114,
"calib/std_conf": 0.06264154461454985,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5180706319702602,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.003663717076643147,
"calib/step_q_w": 0.514406914893617,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 552.61328125,
"completions/mean_terminated_length": 554.7803955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.021705729886889458,
"kl": 0.060863494873046875,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0639,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030267784371972084,
"mask/share_reasoning": 0.8412609100341797,
"mask/share_step_conf": 0.12456506490707397,
"num_tokens": 10427084.0,
"reward": 0.6544273495674133,
"reward_std": 0.19733819365501404,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5236749649047852,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.785179615020752,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7124887108802795,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7413974404335022,
"adv/std_final_conf": 0.8838324546813965,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353353381156921,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5385459710743802,
"calib/avg_num_step_conf": 6.57421875,
"calib/ece": 0.47598393574297176,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.008950800619834665,
"calib/mean_conf": 0.9619277108433736,
"calib/mu_c": 0.9665289256198348,
"calib/mu_w": 0.9575781250000002,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.47598393574297176,
"calib/std_conf": 0.06288494941264759,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5137832818532818,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.002010434171162645,
"calib/step_q_w": 0.5117728476821192,
"calib/step_q_w_n": 906.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 586.66015625,
"completions/mean_terminated_length": 588.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03233075141906738,
"kl": 0.052997589111328125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0734,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.027658920735120773,
"mask/share_reasoning": 0.8431069850921631,
"mask/share_step_conf": 0.12532782554626465,
"num_tokens": 10683589.0,
"reward": 0.6391909122467041,
"reward_std": 0.17529763281345367,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.5095155835151672,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7688661813735962,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7306512594223022,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.75229811668396,
"adv/std_final_conf": 0.8930631279945374,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353950619697571,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5981738203145828,
"calib/avg_num_step_conf": 6.6328125,
"calib/ece": 0.4441183673469387,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9755102040816327,
"calib/gap": 0.03739562783257799,
"calib/mean_conf": 0.9502408163265307,
"calib/mu_c": 0.9687096774193549,
"calib/mu_w": 0.9313140495867769,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4441183673469387,
"calib/std_conf": 0.12407513839073704,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47270334928229657,
"calib/step_q_c_n": 836.0,
"calib/step_q_gap": 0.0007989409296283445,
"calib/step_q_w": 0.4719044083526682,
"calib/step_q_w_n": 862.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2427.0,
"completions/max_terminated_length": 2427.0,
"completions/mean_length": 596.7578125,
"completions/mean_terminated_length": 599.0980834960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.048,
"grad_norm": 0.023054039105772972,
"kl": 0.060699462890625,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0544,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02872752770781517,
"mask/share_reasoning": 0.8372390866279602,
"mask/share_step_conf": 0.13012711703777313,
"num_tokens": 10941407.0,
"reward": 0.6488115787506104,
"reward_std": 0.1848917305469513,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5322019457817078,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.7654211521148682,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7364819049835205,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7673929929733276,
"adv/std_final_conf": 0.8841802477836609,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350518584251404,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.524188137412775,
"calib/avg_num_step_conf": 7.16015625,
"calib/ece": 0.40813008130081296,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.991869918699187,
"calib/gap": 0.0017190016103058037,
"calib/mean_conf": 0.9677235772357724,
"calib/mu_c": 0.968478260869565,
"calib/mu_w": 0.9667592592592592,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4074390243902439,
"calib/std_conf": 0.020114024882143353,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4751558577405858,
"calib/step_q_c_n": 956.0,
"calib/step_q_gap": 0.06045802421721064,
"calib/step_q_w": 0.41469783352337514,
"calib/step_q_w_n": 877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2677.0,
"completions/max_terminated_length": 2677.0,
"completions/mean_length": 591.49609375,
"completions/mean_terminated_length": 596.153564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.020944081246852875,
"kl": 0.05698394775390625,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0826,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.030672479420900345,
"mask/share_reasoning": 0.8261983394622803,
"mask/share_step_conf": 0.13531672954559326,
"num_tokens": 11197598.0,
"reward": 0.6781916618347168,
"reward_std": 0.1851722002029419,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.565719485282898,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7906638383865356,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7587028741836548,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7555489540100098,
"adv/std_final_conf": 0.8831453323364258,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348834156990051,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5939169337606838,
"calib/avg_num_step_conf": 6.80859375,
"calib/ece": 0.3897580645161291,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9919354838709677,
"calib/gap": 0.006014957264957266,
"calib/mean_conf": 0.9623387096774194,
"calib/mu_c": 0.9648611111111111,
"calib/mu_w": 0.9588461538461538,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.385725806451613,
"calib/std_conf": 0.08767709149218292,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4837613160518444,
"calib/step_q_c_n": 1003.0,
"calib/step_q_gap": 0.039261316051844464,
"calib/step_q_w": 0.44449999999999995,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 620.75390625,
"completions/mean_terminated_length": 623.1882934570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.031001385301351547,
"kl": 0.054290771484375,
"learning_rate": 4.25e-06,
"loss": -0.0831,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.027538388967514038,
"mask/share_reasoning": 0.8438159823417664,
"mask/share_step_conf": 0.1247393935918808,
"num_tokens": 11462487.0,
"reward": 0.6905703544616699,
"reward_std": 0.18100741505622864,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5871163606643677,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7940243482589722,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7352026700973511,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7768785953521729,
"adv/std_final_conf": 0.8923227190971375,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934985339641571,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5392866290983607,
"calib/avg_num_step_conf": 6.50390625,
"calib/ece": 0.4834000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.984,
"calib/gap": 0.0008196721311477528,
"calib/mean_conf": 0.9554000000000001,
"calib/mu_c": 0.9558196721311478,
"calib/mu_w": 0.9550000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4754000000000001,
"calib/std_conf": 0.12248771366957585,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5137683029453015,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.04376200042429307,
"calib/step_q_w": 0.47000630252100845,
"calib/step_q_w_n": 952.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2722.0,
"completions/max_terminated_length": 2722.0,
"completions/mean_length": 548.0859375,
"completions/mean_terminated_length": 552.4015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0512,
"grad_norm": 0.021089090034365654,
"kl": 0.058563232421875,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0529,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031192826107144356,
"mask/share_reasoning": 0.826957106590271,
"mask/share_step_conf": 0.1340375393629074,
"num_tokens": 11706485.0,
"reward": 0.6443170309066772,
"reward_std": 0.1885329931974411,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.5049683451652527,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.783665657043457,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7109791040420532,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7729066610336304,
"adv/std_final_conf": 0.8787632584571838,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349692463874817,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6469702042182792,
"calib/avg_num_step_conf": 6.421875,
"calib/ece": 0.3863709677419357,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.005446267157683593,
"calib/mean_conf": 0.9710483870967744,
"calib/mu_c": 0.9733103448275862,
"calib/mu_w": 0.9678640776699026,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3863709677419357,
"calib/std_conf": 0.011381829632604772,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.53194055313987,
"calib/step_q_c_n": 922.0,
"calib/step_q_gap": 0.0627411071564905,
"calib/step_q_w": 0.4691994459833795,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2418.0,
"completions/max_terminated_length": 2418.0,
"completions/mean_length": 534.62109375,
"completions/mean_terminated_length": 540.9605102539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.044350311160087585,
"kl": 0.06148529052734375,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0972,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.029532913118600845,
"mask/share_reasoning": 0.8296065330505371,
"mask/share_step_conf": 0.12914179265499115,
"num_tokens": 11947884.0,
"reward": 0.6952216029167175,
"reward_std": 0.176457017660141,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5913281440734863,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7991151213645935,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7298943996429443,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7647660374641418,
"adv/std_final_conf": 0.8696848750114441,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347518086433411,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.567796052631579,
"calib/avg_num_step_conf": 6.09765625,
"calib/ece": 0.36623015873015874,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": 0.014100000000000223,
"calib/mean_conf": 0.969404761904762,
"calib/mu_c": 0.975,
"calib/mu_w": 0.9608999999999998,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36623015873015874,
"calib/std_conf": 0.059355356877224465,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5335596026490066,
"calib/step_q_c_n": 906.0,
"calib/step_q_gap": 0.02541667287801419,
"calib/step_q_w": 0.5081429297709924,
"calib/step_q_w_n": 655.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2129.0,
"completions/max_terminated_length": 2129.0,
"completions/mean_length": 551.9375,
"completions/mean_terminated_length": 556.283447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.030394205823540688,
"kl": 0.05947113037109375,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0736,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030669772997498512,
"mask/share_reasoning": 0.8324046730995178,
"mask/share_step_conf": 0.12911301851272583,
"num_tokens": 12194540.0,
"reward": 0.7142380475997925,
"reward_std": 0.1702766716480255,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6199074387550354,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8085687756538391,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7021123170852661,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7562159895896912,
"adv/std_final_conf": 0.8730514049530029,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346850514411926,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5702390545259199,
"calib/avg_num_step_conf": 6.3984375,
"calib/ece": 0.3886008064516131,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0035915928015042287,
"calib/mean_conf": 0.977310483870968,
"calib/mu_c": 0.9787876712328768,
"calib/mu_w": 0.9751960784313726,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3886008064516131,
"calib/std_conf": 0.01285826782186358,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.530438864628821,
"calib/step_q_c_n": 916.0,
"calib/step_q_gap": 0.028391773216078664,
"calib/step_q_w": 0.5020470914127423,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 594.25,
"completions/mean_terminated_length": 596.5804443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.0544,
"grad_norm": 0.02666338160634041,
"kl": 0.050075531005859375,
"learning_rate": 4.138888888888889e-06,
"loss": -0.1021,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02876748889684677,
"mask/share_reasoning": 0.8479665517807007,
"mask/share_step_conf": 0.11935971677303314,
"num_tokens": 12455964.0,
"reward": 0.688745379447937,
"reward_std": 0.16984319686889648,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5894192457199097,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7880715131759644,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.6500508785247803,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7584314942359924,
"adv/std_final_conf": 0.8351729512214661,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345820546150208,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6318421979570271,
"calib/avg_num_step_conf": 6.25,
"calib/ece": 0.31503571428571436,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": 0.009007749207467164,
"calib/mean_conf": 0.977734126984127,
"calib/mu_c": 0.9807724550898202,
"calib/mu_w": 0.9717647058823531,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.31503571428571436,
"calib/std_conf": 0.01906205264975339,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5362592592592592,
"calib/step_q_c_n": 1026.0,
"calib/step_q_gap": 0.07207459026971214,
"calib/step_q_w": 0.4641846689895471,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2510.0,
"completions/max_terminated_length": 2510.0,
"completions/mean_length": 547.9375,
"completions/mean_terminated_length": 552.251953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.03358688950538635,
"kl": 0.0588531494140625,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0612,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.029904557392001152,
"mask/share_reasoning": 0.8369725942611694,
"mask/share_step_conf": 0.12531036138534546,
"num_tokens": 12704188.0,
"reward": 0.7537912130355835,
"reward_std": 0.16553309559822083,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6702480316162109,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.837334394454956,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.6293481588363647,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7667917013168335,
"adv/std_final_conf": 0.8136529922485352,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934159517288208,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5346151405546038,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.38500000000000023,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.003469763309854712,
"calib/mean_conf": 0.9826562500000002,
"calib/mu_c": 0.9840522875816994,
"calib/mu_w": 0.9805825242718447,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38500000000000023,
"calib/std_conf": 0.012214513331995676,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5208728897715988,
"calib/step_q_c_n": 1007.0,
"calib/step_q_gap": 0.014289298440329556,
"calib/step_q_w": 0.5065835913312693,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1239.0,
"completions/max_terminated_length": 1239.0,
"completions/mean_length": 520.89453125,
"completions/mean_terminated_length": 522.937255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.02033303678035736,
"kl": 0.06036376953125,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0346,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03004293330013752,
"mask/share_reasoning": 0.8352757692337036,
"mask/share_step_conf": 0.1307750642299652,
"num_tokens": 12943361.0,
"reward": 0.7163116931915283,
"reward_std": 0.15128588676452637,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6128312349319458,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8197921514511108,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.5297777056694031,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7386770248413086,
"adv/std_final_conf": 0.7592636346817017,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339870810508728,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.62256006006006,
"calib/avg_num_step_conf": 6.09375,
"calib/ece": 0.26154724409448826,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9763779527559056,
"calib/gap": 0.05311576576576593,
"calib/mean_conf": 0.9702086614173229,
"calib/mu_c": 0.9856833333333334,
"calib/mu_w": 0.9325675675675674,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26154724409448826,
"calib/std_conf": 0.11417657862312314,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5778975069252077,
"calib/step_q_c_n": 1083.0,
"calib/step_q_gap": 0.081314697700889,
"calib/step_q_w": 0.4965828092243187,
"calib/step_q_w_n": 477.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1559.0,
"completions/max_terminated_length": 1559.0,
"completions/mean_length": 493.16796875,
"completions/mean_terminated_length": 495.10198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0576,
"grad_norm": 0.020669786259531975,
"kl": 0.05802154541015625,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0352,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03211697190999985,
"mask/share_reasoning": 0.8330959677696228,
"mask/share_step_conf": 0.13088083267211914,
"num_tokens": 13175844.0,
"reward": 0.7794877290725708,
"reward_std": 0.12982261180877686,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.728294312953949,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8306810259819031,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.6623827219009399,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7723299264907837,
"adv/std_final_conf": 0.8290383219718933,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347923398017883,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.61313240728941,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.44158964143426305,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9800796812749004,
"calib/gap": 0.03541691092137156,
"calib/mean_conf": 0.9714701195219123,
"calib/mu_c": 0.9881203007518798,
"calib/mu_w": 0.9527033898305083,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44158964143426305,
"calib/std_conf": 0.11249645458594695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5656493333333333,
"calib/step_q_c_n": 750.0,
"calib/step_q_gap": 0.020755089692101825,
"calib/step_q_w": 0.5448942436412315,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2187.0,
"completions/max_terminated_length": 2187.0,
"completions/mean_length": 539.51953125,
"completions/mean_terminated_length": 539.51953125,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.02158159762620926,
"kl": 0.051700592041015625,
"learning_rate": 4.027777777777779e-06,
"loss": -0.1093,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03160548582673073,
"mask/share_reasoning": 0.8430644273757935,
"mask/share_step_conf": 0.1253300905227661,
"num_tokens": 13421785.0,
"reward": 0.6731129884719849,
"reward_std": 0.1911894977092743,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5499264597892761,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7962995767593384,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.6451209187507629,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7391470670700073,
"adv/std_final_conf": 0.846241295337677,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934406578540802,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5542320539484464,
"calib/avg_num_step_conf": 6.48046875,
"calib/ece": 0.49583333333333346,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.006778218944980208,
"calib/mean_conf": 0.9839285714285716,
"calib/mu_c": 0.9873983739837399,
"calib/mu_w": 0.9806201550387597,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.49583333333333346,
"calib/std_conf": 0.039107641150699465,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5789016602809706,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.05683545023530845,
"calib/step_q_w": 0.5220662100456621,
"calib/step_q_w_n": 876.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2869.0,
"completions/max_terminated_length": 2869.0,
"completions/mean_length": 555.90625,
"completions/mean_terminated_length": 555.90625,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.021703317761421204,
"kl": 0.05384063720703125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0124,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030571769922971725,
"mask/share_reasoning": 0.8379673957824707,
"mask/share_step_conf": 0.13146084547042847,
"num_tokens": 13670937.0,
"reward": 0.6544367074966431,
"reward_std": 0.18766939640045166,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.4979339838027954,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.810939371585846,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.5386355519294739,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7594729661941528,
"adv/std_final_conf": 0.7653334736824036,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345945715904236,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6114978975666919,
"calib/avg_num_step_conf": 6.0390625,
"calib/ece": 0.33146031746031757,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": 0.02635913696836001,
"calib/mean_conf": 0.9782857142857143,
"calib/mu_c": 0.9875950920245399,
"calib/mu_w": 0.9612359550561799,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33146031746031757,
"calib/std_conf": 0.08794883587874432,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5837346723044398,
"calib/step_q_c_n": 946.0,
"calib/step_q_gap": 0.05833467230443978,
"calib/step_q_w": 0.5254,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2808.0,
"completions/max_terminated_length": 2808.0,
"completions/mean_length": 525.4609375,
"completions/mean_terminated_length": 529.5984497070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 202.0,
"epoch": 0.0608,
"grad_norm": 0.022320527583360672,
"kl": 0.0495452880859375,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0612,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030892664566636086,
"mask/share_reasoning": 0.8362057209014893,
"mask/share_step_conf": 0.1250891387462616,
"num_tokens": 13912247.0,
"reward": 0.735082745552063,
"reward_std": 0.15631389617919922,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6555935740470886,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8145719766616821,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.6802141666412354,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7571486830711365,
"adv/std_final_conf": 0.8551135063171387,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345582723617554,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5296961756815771,
"calib/avg_num_step_conf": 6.99609375,
"calib/ece": 0.5134645669291338,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9881889763779528,
"calib/gap": 0.021724374571089777,
"calib/mean_conf": 0.9740944881889764,
"calib/mu_c": 0.9858119658119657,
"calib/mu_w": 0.964087591240876,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5134645669291338,
"calib/std_conf": 0.10456399412444867,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5574524714828898,
"calib/step_q_c_n": 789.0,
"calib/step_q_gap": 0.03602123395793977,
"calib/step_q_w": 0.52143123752495,
"calib/step_q_w_n": 1002.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2614.0,
"completions/max_terminated_length": 2614.0,
"completions/mean_length": 593.2109375,
"completions/mean_terminated_length": 593.2109375,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.024006003513932228,
"kl": 0.04694366455078125,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0508,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02852752059698105,
"mask/share_reasoning": 0.8432724475860596,
"mask/share_step_conf": 0.12820005416870117,
"num_tokens": 14170429.0,
"reward": 0.6374404430389404,
"reward_std": 0.20299085974693298,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.48395466804504395,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7909262180328369,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.5568619966506958,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7574281096458435,
"adv/std_final_conf": 0.7681616544723511,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340794086456299,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.601207883026065,
"calib/avg_num_step_conf": 6.1171875,
"calib/ece": 0.41395256916996054,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.01675524475524459,
"calib/mean_conf": 0.9765612648221345,
"calib/mu_c": 0.9838461538461536,
"calib/mu_w": 0.967090909090909,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4126482213438736,
"calib/std_conf": 0.0853430255930299,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5790709134615385,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": 0.050446935259903625,
"calib/step_q_w": 0.5286239782016349,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2264.0,
"completions/max_terminated_length": 2264.0,
"completions/mean_length": 539.78515625,
"completions/mean_terminated_length": 539.78515625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.02217506803572178,
"kl": 0.05609893798828125,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0667,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03168308734893799,
"mask/share_reasoning": 0.842609703540802,
"mask/share_step_conf": 0.1257072240114212,
"num_tokens": 14414862.0,
"reward": 0.6876438856124878,
"reward_std": 0.1636931449174881,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5791339874267578,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7961536645889282,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.631127119064331,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.749966025352478,
"adv/std_final_conf": 0.8395038843154907,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346640706062317,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5691349934469201,
"calib/avg_num_step_conf": 5.74609375,
"calib/ece": 0.42136546184738954,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.009851245085190086,
"calib/mean_conf": 0.9797590361445785,
"calib/mu_c": 0.9840714285714285,
"calib/mu_w": 0.9742201834862384,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.41943775100401604,
"calib/std_conf": 0.06649532828258188,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5907919463087248,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.05000544493131431,
"calib/step_q_w": 0.5407865013774105,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 547.9765625,
"completions/mean_terminated_length": 550.1255493164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.064,
"grad_norm": 0.027142062783241272,
"kl": 0.049556732177734375,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0199,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031455203890800476,
"mask/share_reasoning": 0.8425741195678711,
"mask/share_step_conf": 0.12206438928842545,
"num_tokens": 14664000.0,
"reward": 0.6877095699310303,
"reward_std": 0.20329821109771729,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5601452589035034,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8152737617492676,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.5472803115844727,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.759605884552002,
"adv/std_final_conf": 0.7648850679397583,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337583780288696,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5260767854685565,
"calib/avg_num_step_conf": 5.953125,
"calib/ece": 0.3245098039215687,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.880418329469062e-05,
"calib/mean_conf": 0.9872549019607844,
"calib/mu_c": 0.9872781065088758,
"calib/mu_w": 0.9872093023255811,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3245098039215687,
"calib/std_conf": 0.008279062031875546,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5917596253902185,
"calib/step_q_c_n": 961.0,
"calib/step_q_gap": 0.04479692556783843,
"calib/step_q_w": 0.5469626998223801,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2885.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 464.95703125,
"completions/mean_terminated_length": 464.95703125,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.018482623621821404,
"kl": 0.0516357421875,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0108,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03660412132740021,
"mask/share_reasoning": 0.8234680891036987,
"mask/share_step_conf": 0.13992780447006226,
"num_tokens": 14887093.0,
"reward": 0.7381289005279541,
"reward_std": 0.15750961005687714,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.6685198545455933,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8077378273010254,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.585199773311615,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7739436626434326,
"adv/std_final_conf": 0.7961553335189819,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935192883014679,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5288201160541587,
"calib/avg_num_step_conf": 6.05859375,
"calib/ece": 0.42384860557768916,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9760956175298805,
"calib/gap": 0.007553449387492028,
"calib/mean_conf": 0.9806613545816734,
"calib/mu_c": 0.9839716312056737,
"calib/mu_w": 0.9764181818181816,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4213784860557768,
"calib/std_conf": 0.053871257743940894,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5654795396419438,
"calib/step_q_c_n": 782.0,
"calib/step_q_gap": 0.05029358385520777,
"calib/step_q_w": 0.515185955786736,
"calib/step_q_w_n": 769.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2692.0,
"completions/max_terminated_length": 2692.0,
"completions/mean_length": 537.0234375,
"completions/mean_terminated_length": 539.1294555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.022088345140218735,
"kl": 0.05084991455078125,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0193,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03091275691986084,
"mask/share_reasoning": 0.8408281803131104,
"mask/share_step_conf": 0.12435280531644821,
"num_tokens": 15131651.0,
"reward": 0.6745160818099976,
"reward_std": 0.2127370536327362,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5678347945213318,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7811973094940186,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.6797653436660767,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7620604634284973,
"adv/std_final_conf": 0.8699020147323608,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348795413970947,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6173139158576052,
"calib/avg_num_step_conf": 5.86328125,
"calib/ece": 0.3892055335968381,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.008766796116504882,
"calib/mean_conf": 0.9820909090909093,
"calib/mu_c": 0.98566,
"calib/mu_w": 0.9768932038834951,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3892055335968381,
"calib/std_conf": 0.019291287952678496,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5847171597633136,
"calib/step_q_c_n": 845.0,
"calib/step_q_gap": 0.04589703781209409,
"calib/step_q_w": 0.5388201219512195,
"calib/step_q_w_n": 656.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2445.0,
"completions/max_terminated_length": 2445.0,
"completions/mean_length": 575.84375,
"completions/mean_terminated_length": 575.84375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.0672,
"grad_norm": 0.02352093905210495,
"kl": 0.04439544677734375,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0372,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030028386041522026,
"mask/share_reasoning": 0.8535323143005371,
"mask/share_step_conf": 0.11643929779529572,
"num_tokens": 15387707.0,
"reward": 0.7122840285301208,
"reward_std": 0.21272410452365875,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6038464307785034,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8207215666770935,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7428898811340332,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7612664103507996,
"adv/std_final_conf": 0.8729450106620789,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934707522392273,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5357508641659199,
"calib/avg_num_step_conf": 5.82421875,
"calib/ece": 0.4023320158102767,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9881422924901185,
"calib/gap": 0.010471130457047617,
"calib/mean_conf": 0.9794071146245059,
"calib/mu_c": 0.983835616438356,
"calib/mu_w": 0.9733644859813084,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4023320158102767,
"calib/std_conf": 0.06388017027932623,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5645569620253165,
"calib/step_q_c_n": 869.0,
"calib/step_q_gap": 0.009211302861329318,
"calib/step_q_w": 0.5553456591639871,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 519.44921875,
"completions/mean_terminated_length": 519.44921875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.026329757645726204,
"kl": 0.050830841064453125,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0354,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0327141210436821,
"mask/share_reasoning": 0.8409000635147095,
"mask/share_step_conf": 0.12638577818870544,
"num_tokens": 15624462.0,
"reward": 0.6897340416908264,
"reward_std": 0.23057618737220764,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.584220290184021,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7952476739883423,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.5738707780838013,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7580282688140869,
"adv/std_final_conf": 0.8090132474899292,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344430565834045,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.511667917917918,
"calib/avg_num_step_conf": 5.26953125,
"calib/ece": 0.4193333333333335,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9803921568627451,
"calib/gap": 0.0030048798798798515,
"calib/mean_conf": 0.9840392156862745,
"calib/mu_c": 0.9853472222222224,
"calib/mu_w": 0.9823423423423425,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4193333333333335,
"calib/std_conf": 0.02298674876951273,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6007589743589744,
"calib/step_q_c_n": 780.0,
"calib/step_q_gap": 0.014379361002208157,
"calib/step_q_w": 0.5863796133567662,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2528.0,
"completions/max_terminated_length": 2528.0,
"completions/mean_length": 437.65234375,
"completions/mean_terminated_length": 437.65234375,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.03262828290462494,
"kl": 0.06385040283203125,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0167,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.037267137318849564,
"mask/share_reasoning": 0.8303743600845337,
"mask/share_step_conf": 0.13235852122306824,
"num_tokens": 15841525.0,
"reward": 0.6861559152603149,
"reward_std": 0.15331321954727173,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5766515135765076,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7956601977348328,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.6304284930229187,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7556469440460205,
"adv/std_final_conf": 0.8624138832092285,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9332277178764343,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6001536393316689,
"calib/avg_num_step_conf": 6.37890625,
"calib/ece": 0.46936000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.98,
"calib/gap": 0.014055438192177028,
"calib/mean_conf": 0.9708800000000001,
"calib/mu_c": 0.977795275590551,
"calib/mu_w": 0.963739837398374,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.46612000000000003,
"calib/std_conf": 0.08142251285731729,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5993985815602836,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.11795353845683526,
"calib/step_q_w": 0.4814450431034483,
"calib/step_q_w_n": 928.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2788.0,
"completions/max_terminated_length": 2788.0,
"completions/mean_length": 565.57421875,
"completions/mean_terminated_length": 567.7921752929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0704,
"grad_norm": 0.02561318129301071,
"kl": 0.04900360107421875,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0122,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030985288321971893,
"mask/share_reasoning": 0.8431658744812012,
"mask/share_step_conf": 0.12194260954856873,
"num_tokens": 16092664.0,
"reward": 0.6660435795783997,
"reward_std": 0.1502438485622406,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5234804749488831,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8086066246032715,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.6108351945877075,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7629013657569885,
"adv/std_final_conf": 0.8174425363540649,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9334231019020081,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6787662337662338,
"calib/avg_num_step_conf": 5.87890625,
"calib/ece": 0.37299212598425197,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.968503937007874,
"calib/gap": 0.010122077922077755,
"calib/mean_conf": 0.9739370078740157,
"calib/mu_c": 0.9779220779220779,
"calib/mu_w": 0.9678000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37031496062992125,
"calib/std_conf": 0.0479034047795392,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5916325802615934,
"calib/step_q_c_n": 841.0,
"calib/step_q_gap": 0.07866571279171386,
"calib/step_q_w": 0.5129668674698795,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 530.28515625,
"completions/mean_terminated_length": 530.28515625,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.025846382603049278,
"kl": 0.048938751220703125,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0253,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0322788842022419,
"mask/share_reasoning": 0.8486857414245605,
"mask/share_step_conf": 0.11903537809848785,
"num_tokens": 16333425.0,
"reward": 0.7303739786148071,
"reward_std": 0.13507795333862305,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6216679811477661,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8390799760818481,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.6445533037185669,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7679104804992676,
"adv/std_final_conf": 0.8494598865509033,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337493181228638,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6095303506017792,
"calib/avg_num_step_conf": 5.83984375,
"calib/ece": 0.3850199203187251,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9561752988047809,
"calib/gap": 0.02775379382522236,
"calib/mean_conf": 0.9706772908366533,
"calib/mu_c": 0.9821768707482993,
"calib/mu_w": 0.9544230769230769,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3850199203187251,
"calib/std_conf": 0.0737818702981451,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.577835064935065,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.0759178235557546,
"calib/step_q_w": 0.5019172413793104,
"calib/step_q_w_n": 725.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2757.0,
"completions/max_terminated_length": 2757.0,
"completions/mean_length": 511.71875,
"completions/mean_terminated_length": 511.71875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.024281207472085953,
"kl": 0.0457611083984375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.036,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03625905513763428,
"mask/share_reasoning": 0.8334545493125916,
"mask/share_step_conf": 0.13028644025325775,
"num_tokens": 16568513.0,
"reward": 0.713841438293457,
"reward_std": 0.16001370549201965,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6050695180892944,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8226132988929749,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7450152635574341,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7667806148529053,
"adv/std_final_conf": 0.8999899625778198,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348978400230408,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.585109819121447,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.4769196787148596,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9116465863453815,
"calib/gap": 0.01980600775193797,
"calib/mean_conf": 0.958847389558233,
"calib/mu_c": 0.9691083333333333,
"calib/mu_w": 0.9493023255813954,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.4769196787148596,
"calib/std_conf": 0.07261490550305577,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.564964560862866,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.04633939873575832,
"calib/step_q_w": 0.5186251621271076,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2557.0,
"completions/max_terminated_length": 2557.0,
"completions/mean_length": 576.28125,
"completions/mean_terminated_length": 576.28125,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.0736,
"grad_norm": 0.0308236014097929,
"kl": 0.047393798828125,
"learning_rate": 3.638888888888889e-06,
"loss": -0.1002,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030492305755615234,
"mask/share_reasoning": 0.8605618476867676,
"mask/share_step_conf": 0.10894586145877838,
"num_tokens": 16820537.0,
"reward": 0.6534982919692993,
"reward_std": 0.19954201579093933,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5091630816459656,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7978335618972778,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.6634742021560669,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7432050704956055,
"adv/std_final_conf": 0.8920466899871826,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345405697822571,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8211942257217847,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.4060728744939272,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8744939271255061,
"calib/gap": 0.11165485564304456,
"calib/mean_conf": 0.9202429149797571,
"calib/mu_c": 0.9744881889763778,
"calib/mu_w": 0.8628333333333332,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4060728744939272,
"calib/std_conf": 0.18078982613860656,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5868950867052023,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.08678325874821313,
"calib/step_q_w": 0.5001118279569892,
"calib/step_q_w_n": 744.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 549.66796875,
"completions/mean_terminated_length": 551.8235473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.026835061609745026,
"kl": 0.038166046142578125,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0659,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033462610095739365,
"mask/share_reasoning": 0.8393990993499756,
"mask/share_step_conf": 0.12323200702667236,
"num_tokens": 17068244.0,
"reward": 0.6982115507125854,
"reward_std": 0.17986395955085754,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5866332054138184,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.809789776802063,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7221082448959351,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7683535218238831,
"adv/std_final_conf": 0.897761881351471,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347231984138489,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6057759887547123,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.4014285714285715,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8174603174603174,
"calib/gap": 0.005144719187272551,
"calib/mean_conf": 0.9160317460317461,
"calib/mu_c": 0.9182978723404257,
"calib/mu_w": 0.9131531531531532,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37896825396825407,
"calib/std_conf": 0.1487998515375143,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5435220125786164,
"calib/step_q_c_n": 795.0,
"calib/step_q_gap": 0.01309999422999264,
"calib/step_q_w": 0.5304220183486238,
"calib/step_q_w_n": 763.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2561.0,
"completions/max_terminated_length": 2561.0,
"completions/mean_length": 524.51171875,
"completions/mean_terminated_length": 526.5686645507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.03249030187726021,
"kl": 0.04720306396484375,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0642,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03268418461084366,
"mask/share_reasoning": 0.8373799324035645,
"mask/share_step_conf": 0.1260296255350113,
"num_tokens": 17306927.0,
"reward": 0.696674108505249,
"reward_std": 0.1723061352968216,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5973577499389648,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7959904670715332,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7195709347724915,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7566730976104736,
"adv/std_final_conf": 0.9068205952644348,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341710805892944,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7483070672248215,
"calib/avg_num_step_conf": 5.5703125,
"calib/ece": 0.4055294117647057,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.792156862745098,
"calib/gap": 0.08504617089386846,
"calib/mean_conf": 0.9192549019607842,
"calib/mu_c": 0.9606106870229008,
"calib/mu_w": 0.8755645161290323,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4055294117647057,
"calib/std_conf": 0.12842573853830094,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5965980795610425,
"calib/step_q_c_n": 729.0,
"calib/step_q_gap": 0.0690299303501386,
"calib/step_q_w": 0.5275681492109039,
"calib/step_q_w_n": 697.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 490.640625,
"completions/mean_terminated_length": 490.640625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.0768,
"grad_norm": 0.05706058070063591,
"kl": 0.04985809326171875,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0217,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03344859182834625,
"mask/share_reasoning": 0.8429282903671265,
"mask/share_step_conf": 0.12362314760684967,
"num_tokens": 17536939.0,
"reward": 0.7275208234786987,
"reward_std": 0.16687637567520142,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6093425750732422,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8456990718841553,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7557467222213745,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7590437531471252,
"adv/std_final_conf": 0.9078731536865234,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344611167907715,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6708327530984544,
"calib/avg_num_step_conf": 5.53125,
"calib/ece": 0.26715415019762856,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8063241106719368,
"calib/gap": 0.0615018799610082,
"calib/mean_conf": 0.9267588932806324,
"calib/mu_c": 0.9476646706586825,
"calib/mu_w": 0.8861627906976743,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2669169960474309,
"calib/std_conf": 0.11452155660328063,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6067189384800965,
"calib/step_q_c_n": 829.0,
"calib/step_q_gap": 0.11199338481740484,
"calib/step_q_w": 0.49472555366269166,
"calib/step_q_w_n": 587.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 504.953125,
"completions/mean_terminated_length": 504.953125,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.04618416354060173,
"kl": 0.038722991943359375,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0321,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032340243458747864,
"mask/share_reasoning": 0.8512711524963379,
"mask/share_step_conf": 0.11638855934143066,
"num_tokens": 17773239.0,
"reward": 0.7775202393531799,
"reward_std": 0.1803993135690689,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7105652093887329,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8444753289222717,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7238792181015015,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7787186503410339,
"adv/std_final_conf": 0.912177324295044,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341882467269897,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6956156716417911,
"calib/avg_num_step_conf": 5.37890625,
"calib/ece": 0.34161023622047243,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.610236220472441,
"calib/gap": 0.0932620646766168,
"calib/mean_conf": 0.8501929133858268,
"calib/mu_c": 0.8942537313432835,
"calib/mu_w": 0.8009916666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33212204724409444,
"calib/std_conf": 0.20350657815253712,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5297560283687943,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.038605730749746714,
"calib/step_q_w": 0.4911502976190476,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1261.0,
"completions/max_terminated_length": 1261.0,
"completions/mean_length": 482.01171875,
"completions/mean_terminated_length": 483.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.043702512979507446,
"kl": 0.06292724609375,
"learning_rate": 3.5e-06,
"loss": -0.0573,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034550879150629044,
"mask/share_reasoning": 0.8362744450569153,
"mask/share_step_conf": 0.12526842951774597,
"num_tokens": 18000562.0,
"reward": 0.7414271235466003,
"reward_std": 0.15016880631446838,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6466495990753174,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8362046480178833,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7428607940673828,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7638285160064697,
"adv/std_final_conf": 0.9031973481178284,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933974027633667,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7541257805530777,
"calib/avg_num_step_conf": 5.6171875,
"calib/ece": 0.17786561264822126,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6482213438735178,
"calib/gap": 0.16685325602140932,
"calib/mean_conf": 0.8758102766798419,
"calib/mu_c": 0.9259322033898304,
"calib/mu_w": 0.7590789473684211,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17703557312252957,
"calib/std_conf": 0.1793572063880419,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5425856697819315,
"calib/step_q_c_n": 963.0,
"calib/step_q_gap": 0.07795409083456306,
"calib/step_q_w": 0.4646315789473684,
"calib/step_q_w_n": 475.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1871.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 477.46484375,
"completions/mean_terminated_length": 479.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.08,
"grad_norm": 0.05451728403568268,
"kl": 0.05059051513671875,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0453,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03529536724090576,
"mask/share_reasoning": 0.8326973915100098,
"mask/share_step_conf": 0.12810099124908447,
"num_tokens": 18227545.0,
"reward": 0.8223235607147217,
"reward_std": 0.134343683719635,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7874187231063843,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8572283387184143,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.6845461130142212,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7664108276367188,
"adv/std_final_conf": 0.8880889415740967,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9336568713188171,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6964849354375898,
"calib/avg_num_step_conf": 5.4765625,
"calib/ece": 0.18813492063492057,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6190476190476191,
"calib/gap": 0.15908034433285512,
"calib/mean_conf": 0.8217063492063492,
"calib/mu_c": 0.8734705882352942,
"calib/mu_w": 0.7143902439024391,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16761904761904756,
"calib/std_conf": 0.24617362460619346,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.50348623853211,
"calib/step_q_c_n": 872.0,
"calib/step_q_gap": 0.09935416306041195,
"calib/step_q_w": 0.4041320754716981,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2924.0,
"completions/max_terminated_length": 2924.0,
"completions/mean_length": 509.1171875,
"completions/mean_terminated_length": 509.1171875,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.04806952923536301,
"kl": 0.2750282287597656,
"learning_rate": 3.444444444444445e-06,
"loss": -0.013,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03508011996746063,
"mask/share_reasoning": 0.8423846960067749,
"mask/share_step_conf": 0.12253521382808685,
"num_tokens": 18460935.0,
"reward": 0.8059649467468262,
"reward_std": 0.15631715953350067,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7560847997665405,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8558450937271118,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7558318376541138,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7622382044792175,
"adv/std_final_conf": 0.926922082901001,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341432452201843,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.590406162464986,
"calib/avg_num_step_conf": 5.53515625,
"calib/ece": 0.19968379446640322,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5217391304347826,
"calib/gap": 0.12250070028011195,
"calib/mean_conf": 0.786403162055336,
"calib/mu_c": 0.8275595238095238,
"calib/mu_w": 0.7050588235294118,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16102766798418977,
"calib/std_conf": 0.25909409584904747,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49516896918172154,
"calib/step_q_c_n": 941.0,
"calib/step_q_gap": 0.04163955741701569,
"calib/step_q_w": 0.45352941176470585,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2104.0,
"completions/max_terminated_length": 2104.0,
"completions/mean_length": 477.453125,
"completions/mean_terminated_length": 479.3255310058594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.05456750467419624,
"kl": 0.0598602294921875,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0511,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037136346101760864,
"mask/share_reasoning": 0.8228158354759216,
"mask/share_step_conf": 0.1361415535211563,
"num_tokens": 18687827.0,
"reward": 0.7886154055595398,
"reward_std": 0.15151908993721008,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7406773567199707,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8365534543991089,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7504051923751831,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7541760206222534,
"adv/std_final_conf": 0.9153277277946472,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337966442108154,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6337736337736338,
"calib/avg_num_step_conf": 6.2890625,
"calib/ece": 0.19170916334661353,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4581673306772908,
"calib/gap": 0.14532808857808865,
"calib/mean_conf": 0.7227131474103586,
"calib/mu_c": 0.7852447552447553,
"calib/mu_w": 0.6399166666666667,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17235059760956176,
"calib/std_conf": 0.290884441579542,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4497088465845465,
"calib/step_q_c_n": 893.0,
"calib/step_q_gap": 0.027031022316764075,
"calib/step_q_w": 0.42267782426778244,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2564.0,
"completions/max_terminated_length": 2564.0,
"completions/mean_length": 560.30078125,
"completions/mean_terminated_length": 560.30078125,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.0832,
"grad_norm": 0.043129973113536835,
"kl": 0.06533050537109375,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0323,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030203985050320625,
"mask/share_reasoning": 0.847260594367981,
"mask/share_step_conf": 0.12253537029027939,
"num_tokens": 18939288.0,
"reward": 0.7746736407279968,
"reward_std": 0.16919748485088348,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7040666341781616,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.845280647277832,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.7007652521133423,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7351545095443726,
"adv/std_final_conf": 0.9115530848503113,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9309883713722229,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6958598726114651,
"calib/avg_num_step_conf": 6.41796875,
"calib/ece": 0.18126482213438738,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.47035573122529645,
"calib/gap": 0.18178144904458582,
"calib/mean_conf": 0.7593675889328063,
"calib/mu_c": 0.8283439490445859,
"calib/mu_w": 0.6465625,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16003952569169966,
"calib/std_conf": 0.26704871292707055,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4517631048387097,
"calib/step_q_c_n": 992.0,
"calib/step_q_gap": 0.043038988095238095,
"calib/step_q_w": 0.4087241167434716,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2536.0,
"completions/max_terminated_length": 2536.0,
"completions/mean_length": 548.93359375,
"completions/mean_terminated_length": 548.93359375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.03477565571665764,
"kl": 0.05063629150390625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0617,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030998801812529564,
"mask/share_reasoning": 0.8435428142547607,
"mask/share_step_conf": 0.12545835971832275,
"num_tokens": 19186191.0,
"reward": 0.8097378015518188,
"reward_std": 0.13792501389980316,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7506546974182129,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8688209056854248,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6384887099266052,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7562040090560913,
"adv/std_final_conf": 0.8373948931694031,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9335655570030212,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.688919259882254,
"calib/avg_num_step_conf": 6.0390625,
"calib/ece": 0.20148437500000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.71484375,
"calib/gap": 0.1692038127277823,
"calib/mean_conf": 0.8758593750000001,
"calib/mu_c": 0.9300574712643678,
"calib/mu_w": 0.7608536585365855,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19882812500000008,
"calib/std_conf": 0.21077619700670516,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4759594202898551,
"calib/step_q_c_n": 1035.0,
"calib/step_q_gap": 0.04247605434073576,
"calib/step_q_w": 0.43348336594911935,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1307.0,
"completions/max_terminated_length": 1307.0,
"completions/mean_length": 470.046875,
"completions/mean_terminated_length": 471.8902282714844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.11761131137609482,
"kl": 0.435760498046875,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0153,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034512780606746674,
"mask/share_reasoning": 0.8250635862350464,
"mask/share_step_conf": 0.13651734590530396,
"num_tokens": 19408683.0,
"reward": 0.8117129802703857,
"reward_std": 0.16109926998615265,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7730531096458435,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.850372850894928,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6516029238700867,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7672962546348572,
"adv/std_final_conf": 0.8549244999885559,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934212327003479,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7298333333333333,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.22283999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.648,
"calib/gap": 0.23826666666666663,
"calib/mean_conf": 0.8063600000000001,
"calib/mu_c": 0.9016666666666666,
"calib/mu_w": 0.6634,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21459999999999993,
"calib/std_conf": 0.28221968464300995,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49048453473132375,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": 0.11166582128103136,
"calib/step_q_w": 0.3788187134502924,
"calib/step_q_w_n": 855.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2883.0,
"completions/max_terminated_length": 2883.0,
"completions/mean_length": 535.36328125,
"completions/mean_terminated_length": 539.5787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0864,
"grad_norm": 0.4449962079524994,
"kl": 0.2868461608886719,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.0571,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03392023593187332,
"mask/share_reasoning": 0.8354710340499878,
"mask/share_step_conf": 0.12279621511697769,
"num_tokens": 19651984.0,
"reward": 0.7835407257080078,
"reward_std": 0.16362658143043518,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7345074415206909,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8325740098953247,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.5848948955535889,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7582812309265137,
"adv/std_final_conf": 0.8100059628486633,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9338537454605103,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.683876329787234,
"calib/avg_num_step_conf": 5.6953125,
"calib/ece": 0.2776771653543308,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7992125984251969,
"calib/gap": 0.14309042553191487,
"calib/mean_conf": 0.8927952755905513,
"calib/mu_c": 0.94575,
"calib/mu_w": 0.8026595744680851,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.27027559055118117,
"calib/std_conf": 0.21623400996211056,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5147575057736721,
"calib/step_q_c_n": 866.0,
"calib/step_q_gap": 0.09261223550340181,
"calib/step_q_w": 0.42214527027027027,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2854.0,
"completions/max_terminated_length": 2854.0,
"completions/mean_length": 474.65234375,
"completions/mean_terminated_length": 474.65234375,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.03431045264005661,
"kl": 0.057056427001953125,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0801,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03597622364759445,
"mask/share_reasoning": 0.8335458040237427,
"mask/share_step_conf": 0.1304779350757599,
"num_tokens": 19879047.0,
"reward": 0.7802107334136963,
"reward_std": 0.14769387245178223,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7121269702911377,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8482944369316101,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.5824180245399475,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7441925406455994,
"adv/std_final_conf": 0.8023613691329956,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9336576461791992,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6964833759590793,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.3376095617529881,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7649402390438247,
"calib/gap": 0.18492710997442463,
"calib/mean_conf": 0.8492430278884463,
"calib/mu_c": 0.9339705882352942,
"calib/mu_w": 0.7490434782608696,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3225099601593626,
"calib/std_conf": 0.27422877132987855,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5241968162083936,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.11191183546384942,
"calib/step_q_w": 0.4122849807445442,
"calib/step_q_w_n": 779.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2663.0,
"completions/max_terminated_length": 2663.0,
"completions/mean_length": 529.0546875,
"completions/mean_terminated_length": 531.1294555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.0549030564725399,
"kl": 0.053955078125,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0636,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03301708400249481,
"mask/share_reasoning": 0.8444280624389648,
"mask/share_step_conf": 0.11864862591028214,
"num_tokens": 20121749.0,
"reward": 0.7538405060768127,
"reward_std": 0.16312643885612488,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6607023477554321,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8469786047935486,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.595024585723877,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7575274705886841,
"adv/std_final_conf": 0.8171303272247314,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340332746505737,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7007662835249041,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.30197628458498027,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7351778656126482,
"calib/gap": 0.1695472541507025,
"calib/mean_conf": 0.8494861660079052,
"calib/mu_c": 0.9218620689655171,
"calib/mu_w": 0.7523148148148147,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2891699604743083,
"calib/std_conf": 0.2684211869489573,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5160706806282723,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.06495401396160566,
"calib/step_q_w": 0.45111666666666667,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2568.0,
"completions/max_terminated_length": 2568.0,
"completions/mean_length": 477.73828125,
"completions/mean_terminated_length": 479.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0896,
"grad_norm": 0.02916264347732067,
"kl": 0.05471038818359375,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.0731,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03586849942803383,
"mask/share_reasoning": 0.8404359221458435,
"mask/share_step_conf": 0.11978927254676819,
"num_tokens": 20349970.0,
"reward": 0.7618058919906616,
"reward_std": 0.1683914214372635,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6817960739135742,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8418155908584595,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6161090135574341,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7381773591041565,
"adv/std_final_conf": 0.8443658351898193,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348047375679016,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7081042157632889,
"calib/avg_num_step_conf": 5.51171875,
"calib/ece": 0.31729838709677416,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7661290322580645,
"calib/gap": 0.22803482587064694,
"calib/mean_conf": 0.8498790322580647,
"calib/mu_c": 0.9547014925373136,
"calib/mu_w": 0.7266666666666667,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31342741935483864,
"calib/std_conf": 0.2724685271183283,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5175361527967258,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.07892258347519193,
"calib/step_q_w": 0.4386135693215339,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 543.35546875,
"completions/mean_terminated_length": 547.6338500976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.03892885521054268,
"kl": 0.055267333984375,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.1226,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03346104174852371,
"mask/share_reasoning": 0.837459921836853,
"mask/share_step_conf": 0.12126647680997849,
"num_tokens": 20596893.0,
"reward": 0.7521539926528931,
"reward_std": 0.21601596474647522,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6710773706436157,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8332306742668152,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.5772691965103149,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7407020330429077,
"adv/std_final_conf": 0.8088988661766052,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9334074258804321,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6682353694455115,
"calib/avg_num_step_conf": 5.86328125,
"calib/ece": 0.2626294820717131,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.649402390438247,
"calib/gap": 0.2195722883851967,
"calib/mean_conf": 0.7728286852589642,
"calib/mu_c": 0.8725547445255476,
"calib/mu_w": 0.6529824561403509,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24482071713147407,
"calib/std_conf": 0.32558431465607224,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5037477303988996,
"calib/step_q_c_n": 727.0,
"calib/step_q_gap": 0.08956168388727176,
"calib/step_q_w": 0.4141860465116279,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 511.859375,
"completions/mean_terminated_length": 517.9288940429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.04384787008166313,
"kl": 0.05931854248046875,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0858,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03420304134488106,
"mask/share_reasoning": 0.8318629860877991,
"mask/share_step_conf": 0.12221519649028778,
"num_tokens": 20833441.0,
"reward": 0.7689346075057983,
"reward_std": 0.14764101803302765,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6896851062774658,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8481841087341309,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.5547035932540894,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7570780515670776,
"adv/std_final_conf": 0.7982771396636963,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9332447052001953,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5813353313353313,
"calib/avg_num_step_conf": 5.5078125,
"calib/ece": 0.21056451612903215,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7782258064516129,
"calib/gap": 0.14928404928404915,
"calib/mean_conf": 0.8651612903225808,
"calib/mu_c": 0.9048901098901099,
"calib/mu_w": 0.7556060606060607,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1709274193548386,
"calib/std_conf": 0.25982774650987445,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5179849537037037,
"calib/step_q_c_n": 864.0,
"calib/step_q_gap": 0.0944153566341066,
"calib/step_q_w": 0.42356959706959707,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2828.0,
"completions/max_terminated_length": 2828.0,
"completions/mean_length": 465.76171875,
"completions/mean_terminated_length": 471.28460693359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0928,
"grad_norm": 0.03590984642505646,
"kl": 0.06134796142578125,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0383,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.037011705338954926,
"mask/share_reasoning": 0.829144299030304,
"mask/share_step_conf": 0.12212523818016052,
"num_tokens": 21058172.0,
"reward": 0.7914502024650574,
"reward_std": 0.1662648767232895,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7531968951225281,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8297035694122314,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6191021203994751,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7256580591201782,
"adv/std_final_conf": 0.8432807922363281,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933613121509552,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.772495361781076,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.1913492063492063,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6468253968253969,
"calib/gap": 0.31662337662337636,
"calib/mean_conf": 0.7706349206349208,
"calib/mu_c": 0.8937662337662337,
"calib/mu_w": 0.5771428571428573,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1754365079365079,
"calib/std_conf": 0.3207883296370722,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47935856992639325,
"calib/step_q_c_n": 951.0,
"calib/step_q_gap": 0.06557705732135127,
"calib/step_q_w": 0.413781512605042,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2173.0,
"completions/max_terminated_length": 2173.0,
"completions/mean_length": 512.1640625,
"completions/mean_terminated_length": 514.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.04056946933269501,
"kl": 0.0717620849609375,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0657,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03307269513607025,
"mask/share_reasoning": 0.8433363437652588,
"mask/share_step_conf": 0.11968475580215454,
"num_tokens": 21299134.0,
"reward": 0.8107873201370239,
"reward_std": 0.16860932111740112,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.768801212310791,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8527735471725464,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.6300060749053955,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7494813203811646,
"adv/std_final_conf": 0.849608838558197,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.932580828666687,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7548370832545535,
"calib/avg_num_step_conf": 5.5546875,
"calib/ece": 0.18722222222222218,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.49206349206349204,
"calib/gap": 0.2934959349593498,
"calib/mean_conf": 0.6700793650793652,
"calib/mu_c": 0.8133333333333335,
"calib/mu_w": 0.5198373983739837,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17269841269841263,
"calib/std_conf": 0.3497663529239085,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49108462623413257,
"calib/step_q_c_n": 709.0,
"calib/step_q_gap": 0.0835109936955632,
"calib/step_q_w": 0.4075736325385694,
"calib/step_q_w_n": 713.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 514.46875,
"completions/mean_terminated_length": 516.486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.042645957320928574,
"kl": 0.063812255859375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0877,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03544268012046814,
"mask/share_reasoning": 0.8371367454528809,
"mask/share_step_conf": 0.12351429462432861,
"num_tokens": 21539726.0,
"reward": 0.786715030670166,
"reward_std": 0.16263370215892792,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7260781526565552,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8473520278930664,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.5670766830444336,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7358335256576538,
"adv/std_final_conf": 0.8081160187721252,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9330614805221558,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7155321125265394,
"calib/avg_num_step_conf": 5.81640625,
"calib/ece": 0.20063241106719354,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.616600790513834,
"calib/gap": 0.27885748407643296,
"calib/mean_conf": 0.7399209486166008,
"calib/mu_c": 0.845732484076433,
"calib/mu_w": 0.566875,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15999999999999984,
"calib/std_conf": 0.3403311478451047,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4967665505226481,
"calib/step_q_c_n": 861.0,
"calib/step_q_gap": 0.08776495816596014,
"calib/step_q_w": 0.409001592356688,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2710.0,
"completions/max_terminated_length": 2710.0,
"completions/mean_length": 485.67578125,
"completions/mean_terminated_length": 487.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.096,
"grad_norm": 0.045059166848659515,
"kl": 0.06566619873046875,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0261,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034171998500823975,
"mask/share_reasoning": 0.8349112272262573,
"mask/share_step_conf": 0.1270105242729187,
"num_tokens": 21767379.0,
"reward": 0.8125478029251099,
"reward_std": 0.12349230796098709,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7568085789680481,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8682869672775269,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.6261551380157471,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7426559329032898,
"adv/std_final_conf": 0.8279377222061157,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9330735206604004,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6463037634408602,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.24760956175298796,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6135458167330677,
"calib/gap": 0.1969865591397848,
"calib/mean_conf": 0.7395617529880478,
"calib/mu_c": 0.8149032258064516,
"calib/mu_w": 0.6179166666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18482071713147402,
"calib/std_conf": 0.33865595483960037,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4733818181818181,
"calib/step_q_c_n": 825.0,
"calib/step_q_gap": 0.05048478502300491,
"calib/step_q_w": 0.4228970331588132,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 500.83203125,
"completions/mean_terminated_length": 500.83203125,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.059418559074401855,
"kl": 0.0816192626953125,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.019,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033103737980127335,
"mask/share_reasoning": 0.8485387563705444,
"mask/share_step_conf": 0.11835750937461853,
"num_tokens": 22003304.0,
"reward": 0.7714670300483704,
"reward_std": 0.1684153825044632,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7130800485610962,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8298540711402893,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.5845734477043152,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7730364799499512,
"adv/std_final_conf": 0.8114038705825806,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9334744215011597,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7061542357857414,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.2132941176470587,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.615686274509804,
"calib/gap": 0.25577117892870704,
"calib/mean_conf": 0.7429411764705883,
"calib/mu_c": 0.849261744966443,
"calib/mu_w": 0.593490566037736,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18596078431372534,
"calib/std_conf": 0.3379447345330534,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5279789621318374,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.08138666822502727,
"calib/step_q_w": 0.4465922939068101,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 462.87890625,
"completions/mean_terminated_length": 462.87890625,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.045869529247283936,
"kl": 0.066070556640625,
"learning_rate": 3e-06,
"loss": 0.0002,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.036917682737112045,
"mask/share_reasoning": 0.847240149974823,
"mask/share_step_conf": 0.11584216356277466,
"num_tokens": 22228521.0,
"reward": 0.8019878268241882,
"reward_std": 0.14549410343170166,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7390902042388916,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8648854494094849,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.654515266418457,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7242127656936646,
"adv/std_final_conf": 0.8611568212509155,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343360662460327,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7357438568376068,
"calib/avg_num_step_conf": 6.22265625,
"calib/ece": 0.1570564516129033,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5040322580645161,
"calib/gap": 0.29607371794871806,
"calib/mean_conf": 0.6822983870967742,
"calib/mu_c": 0.8064583333333334,
"calib/mu_w": 0.5103846153846153,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12935483870967748,
"calib/std_conf": 0.3451600255290363,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.519379042690815,
"calib/step_q_c_n": 773.0,
"calib/step_q_gap": 0.09700099391032718,
"calib/step_q_w": 0.42237804878048785,
"calib/step_q_w_n": 820.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3043.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 528.12890625,
"completions/mean_terminated_length": 530.2000122070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0992,
"grad_norm": 0.04389479383826256,
"kl": 0.0801239013671875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0759,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.033197760581970215,
"mask/share_reasoning": 0.8333528637886047,
"mask/share_step_conf": 0.12954315543174744,
"num_tokens": 22469498.0,
"reward": 0.7923904657363892,
"reward_std": 0.1726185381412506,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7471199035644531,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.83766108751297,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.5801839828491211,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7660778760910034,
"adv/std_final_conf": 0.8194279074668884,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933512806892395,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7599281452492462,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.17992063492063484,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5674603174603174,
"calib/gap": 0.3208494258035539,
"calib/mean_conf": 0.7055555555555556,
"calib/mu_c": 0.8443356643356642,
"calib/mu_w": 0.5234862385321103,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15900793650793643,
"calib/std_conf": 0.34628540287877224,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5386538461538461,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.1127048828364618,
"calib/step_q_w": 0.42594896331738435,
"calib/step_q_w_n": 627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2985.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 488.55078125,
"completions/mean_terminated_length": 490.4667053222656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.04189695045351982,
"kl": 0.0657196044921875,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0902,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03538179397583008,
"mask/share_reasoning": 0.8432852625846863,
"mask/share_step_conf": 0.11742669343948364,
"num_tokens": 22703247.0,
"reward": 0.8022552728652954,
"reward_std": 0.1326061487197876,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.760992169380188,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8435183763504028,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5955591201782227,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7577732801437378,
"adv/std_final_conf": 0.8261606097221375,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340616464614868,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7149068322981367,
"calib/avg_num_step_conf": 5.26953125,
"calib/ece": 0.20290836653386435,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6414342629482072,
"calib/gap": 0.2563685300207039,
"calib/mean_conf": 0.7457768924302791,
"calib/mu_c": 0.8377018633540373,
"calib/mu_w": 0.5813333333333334,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15362549800796793,
"calib/std_conf": 0.3445322685722691,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5504002306805075,
"calib/step_q_c_n": 867.0,
"calib/step_q_gap": 0.07861599831536226,
"calib/step_q_w": 0.4717842323651452,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 503.22265625,
"completions/mean_terminated_length": 503.22265625,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.0395638570189476,
"kl": 0.06453323364257812,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0433,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035683706402778625,
"mask/share_reasoning": 0.8422435522079468,
"mask/share_step_conf": 0.122072733938694,
"num_tokens": 22938200.0,
"reward": 0.7931810617446899,
"reward_std": 0.1584271341562271,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7435300946235657,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.842832088470459,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.5166634321212769,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7450376749038696,
"adv/std_final_conf": 0.7603433728218079,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9338791370391846,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7806013431013431,
"calib/avg_num_step_conf": 4.671875,
"calib/ece": 0.14169291338582685,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7401574803149606,
"calib/gap": 0.3162042124542125,
"calib/mean_conf": 0.8294881889763781,
"calib/mu_c": 0.9191208791208791,
"calib/mu_w": 0.6029166666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12732283464566937,
"calib/std_conf": 0.28928117172950024,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5911166253101737,
"calib/step_q_c_n": 806.0,
"calib/step_q_gap": 0.11347559966914811,
"calib/step_q_w": 0.4776410256410256,
"calib/step_q_w_n": 390.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 435.5859375,
"completions/mean_terminated_length": 435.5859375,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1024,
"grad_norm": 0.07836330682039261,
"kl": 0.0711212158203125,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0184,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.039429161697626114,
"mask/share_reasoning": 0.8440896272659302,
"mask/share_step_conf": 0.11648118495941162,
"num_tokens": 23155526.0,
"reward": 0.8367966413497925,
"reward_std": 0.1331299990415573,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.8224198818206787,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8511732816696167,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.5952975153923035,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7472293376922607,
"adv/std_final_conf": 0.8256112337112427,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934228241443634,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6639682539682541,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.2432156862745097,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6823529411764706,
"calib/gap": 0.20131428571428567,
"calib/mean_conf": 0.8073725490196081,
"calib/mu_c": 0.8902666666666667,
"calib/mu_w": 0.688952380952381,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2311764705882352,
"calib/std_conf": 0.2928610699722091,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6027027027027027,
"calib/step_q_c_n": 703.0,
"calib/step_q_gap": 0.10918360548048056,
"calib/step_q_w": 0.4935190972222221,
"calib/step_q_w_n": 576.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 433.79296875,
"completions/mean_terminated_length": 433.79296875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.05005478858947754,
"kl": 0.06501007080078125,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.038694001734256744,
"mask/share_reasoning": 0.8357928395271301,
"mask/share_step_conf": 0.12551318109035492,
"num_tokens": 23371649.0,
"reward": 0.7806545495986938,
"reward_std": 0.1640351563692093,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7153007388114929,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.84600830078125,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.6123366951942444,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7438055872917175,
"adv/std_final_conf": 0.8297456502914429,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343807697296143,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6935420743639922,
"calib/avg_num_step_conf": 4.48046875,
"calib/ece": 0.26298804780876495,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7609561752988048,
"calib/gap": 0.2340821917808219,
"calib/mean_conf": 0.8301593625498008,
"calib/mu_c": 0.928082191780822,
"calib/mu_w": 0.6940000000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2557370517928287,
"calib/std_conf": 0.2952518920902296,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6388505747126437,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.13706618809554338,
"calib/step_q_w": 0.5017843866171003,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2519.0,
"completions/max_terminated_length": 2519.0,
"completions/mean_length": 464.91796875,
"completions/mean_terminated_length": 468.5787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.04752293601632118,
"kl": 0.059108734130859375,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0004,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03807282820343971,
"mask/share_reasoning": 0.8472254276275635,
"mask/share_step_conf": 0.10688920319080353,
"num_tokens": 23596852.0,
"reward": 0.7616933584213257,
"reward_std": 0.18782877922058105,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7075746059417725,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8158121109008789,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6620317697525024,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7611058354377747,
"adv/std_final_conf": 0.8592643737792969,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352956414222717,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.735962294859623,
"calib/avg_num_step_conf": 4.8515625,
"calib/ece": 0.32655870445344126,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5668016194331984,
"calib/gap": 0.2586864234368639,
"calib/mean_conf": 0.7210526315789474,
"calib/mu_c": 0.8739603960396037,
"calib/mu_w": 0.6152739726027399,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3193522267206478,
"calib/std_conf": 0.33710800486864156,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5883026584867076,
"calib/step_q_c_n": 489.0,
"calib/step_q_gap": 0.08741554029281651,
"calib/step_q_w": 0.5008871181938911,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2383.0,
"completions/max_terminated_length": 2383.0,
"completions/mean_length": 528.14453125,
"completions/mean_terminated_length": 534.4071655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1056,
"grad_norm": 0.0692669078707695,
"kl": 0.049961090087890625,
"learning_rate": 2.805555555555556e-06,
"loss": -0.1378,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03356289118528366,
"mask/share_reasoning": 0.8508045673370361,
"mask/share_step_conf": 0.10391384363174438,
"num_tokens": 23837857.0,
"reward": 0.7011371850967407,
"reward_std": 0.22175775468349457,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.6394370794296265,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.762837290763855,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.6037321090698242,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7761998176574707,
"adv/std_final_conf": 0.8127713203430176,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341810345649719,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7539454347964987,
"calib/avg_num_step_conf": 4.8359375,
"calib/ece": 0.22964285714285715,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6746031746031746,
"calib/gap": 0.31578876749089524,
"calib/mean_conf": 0.7761507936507938,
"calib/mu_c": 0.9152482269503547,
"calib/mu_w": 0.5994594594594594,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22313492063492063,
"calib/std_conf": 0.3298169662548136,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6311295180722891,
"calib/step_q_c_n": 664.0,
"calib/step_q_gap": 0.12743613828134837,
"calib/step_q_w": 0.5036933797909408,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2863.0,
"completions/max_terminated_length": 2863.0,
"completions/mean_length": 501.0625,
"completions/mean_terminated_length": 501.0625,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.06930825859308243,
"kl": 0.053985595703125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0075,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035849228501319885,
"mask/share_reasoning": 0.853874921798706,
"mask/share_step_conf": 0.11027580499649048,
"num_tokens": 24073537.0,
"reward": 0.7880070805549622,
"reward_std": 0.18869704008102417,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7400109171867371,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8360031843185425,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.6109015941619873,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7448518872261047,
"adv/std_final_conf": 0.8460630178451538,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343430399894714,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7195292620865139,
"calib/avg_num_step_conf": 4.56640625,
"calib/ece": 0.189402390438247,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5139442231075697,
"calib/gap": 0.2612124681933843,
"calib/mean_conf": 0.6891633466135458,
"calib/mu_c": 0.8140458015267177,
"calib/mu_w": 0.5528333333333334,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1783266932270916,
"calib/std_conf": 0.3360487793205386,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5690834697217676,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.08410139086872098,
"calib/step_q_w": 0.4849820788530466,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2261.0,
"completions/max_terminated_length": 2261.0,
"completions/mean_length": 472.94921875,
"completions/mean_terminated_length": 474.803955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.057462096214294434,
"kl": 0.0570526123046875,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0116,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03623095527291298,
"mask/share_reasoning": 0.8547747135162354,
"mask/share_step_conf": 0.10508811473846436,
"num_tokens": 24301604.0,
"reward": 0.755883514881134,
"reward_std": 0.16906806826591492,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7198277711868286,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7919393181800842,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6098600625991821,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7454801797866821,
"adv/std_final_conf": 0.8268715143203735,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9331259727478027,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8043790849673204,
"calib/avg_num_step_conf": 4.5859375,
"calib/ece": 0.1588537549407114,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5810276679841897,
"calib/gap": 0.3814215686274509,
"calib/mean_conf": 0.7331620553359685,
"calib/mu_c": 0.8839215686274511,
"calib/mu_w": 0.5025000000000002,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1436363636363636,
"calib/std_conf": 0.3366216155834159,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6155306451612903,
"calib/step_q_c_n": 620.0,
"calib/step_q_gap": 0.15861909281471986,
"calib/step_q_w": 0.45691155234657044,
"calib/step_q_w_n": 554.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2521.0,
"completions/max_terminated_length": 2521.0,
"completions/mean_length": 417.44140625,
"completions/mean_terminated_length": 419.0784606933594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.1088,
"grad_norm": 0.055821649730205536,
"kl": 0.065460205078125,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0168,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04316910356283188,
"mask/share_reasoning": 0.8308782577514648,
"mask/share_step_conf": 0.12204640358686447,
"num_tokens": 24515165.0,
"reward": 0.8302434086799622,
"reward_std": 0.1441684067249298,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.8039737939834595,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8565130233764648,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.47849929332733154,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7427432537078857,
"adv/std_final_conf": 0.7303540110588074,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9331086277961731,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7756747860434497,
"calib/avg_num_step_conf": 4.25,
"calib/ece": 0.16671936758893272,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5849802371541502,
"calib/gap": 0.29099868334430545,
"calib/mean_conf": 0.7420553359683795,
"calib/mu_c": 0.8547741935483871,
"calib/mu_w": 0.5637755102040817,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.14806324110671928,
"calib/std_conf": 0.32512892684644035,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6111454823889739,
"calib/step_q_c_n": 653.0,
"calib/step_q_gap": 0.10831329848092786,
"calib/step_q_w": 0.5028321839080461,
"calib/step_q_w_n": 435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2081.0,
"completions/max_terminated_length": 2081.0,
"completions/mean_length": 490.5390625,
"completions/mean_terminated_length": 490.5390625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.05225846916437149,
"kl": 0.05483245849609375,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0374,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036248303949832916,
"mask/share_reasoning": 0.8617591857910156,
"mask/share_step_conf": 0.10199250280857086,
"num_tokens": 24745295.0,
"reward": 0.804502010345459,
"reward_std": 0.13372300565242767,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7684851288795471,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8405188322067261,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6716189384460449,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7436313629150391,
"adv/std_final_conf": 0.8857026100158691,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934344470500946,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7777537930183792,
"calib/avg_num_step_conf": 5.0234375,
"calib/ece": 0.15611764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.40784313725490196,
"calib/gap": 0.36534353028247196,
"calib/mean_conf": 0.6069411764705882,
"calib/mu_c": 0.7989256198347108,
"calib/mu_w": 0.4335820895522388,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14427450980392154,
"calib/std_conf": 0.3562374957841152,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5718568994889267,
"calib/step_q_c_n": 587.0,
"calib/step_q_gap": 0.1134920926219739,
"calib/step_q_w": 0.4583648068669528,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1129.0,
"completions/max_terminated_length": 1129.0,
"completions/mean_length": 458.58984375,
"completions/mean_terminated_length": 460.3882751464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.06820174306631088,
"kl": 0.0662841796875,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0621,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03526720404624939,
"mask/share_reasoning": 0.8457778692245483,
"mask/share_step_conf": 0.11504866182804108,
"num_tokens": 24969374.0,
"reward": 0.8067743182182312,
"reward_std": 0.14951029419898987,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.781417965888977,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8321306705474854,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7189830541610718,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7476561665534973,
"adv/std_final_conf": 0.9116973876953125,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934876561164856,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7085136619081455,
"calib/avg_num_step_conf": 4.19140625,
"calib/ece": 0.19212,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.444,
"calib/gap": 0.2806310961824172,
"calib/mean_conf": 0.6106,
"calib/mu_c": 0.7374452554744526,
"calib/mu_w": 0.4568141592920354,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12736000000000003,
"calib/std_conf": 0.3685941399425661,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5463205828779599,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.1413396668474256,
"calib/step_q_w": 0.4049809160305343,
"calib/step_q_w_n": 524.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 463.27734375,
"completions/mean_terminated_length": 468.7707824707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.112,
"grad_norm": 0.058943260461091995,
"kl": 0.063934326171875,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.073,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03884696960449219,
"mask/share_reasoning": 0.8451870679855347,
"mask/share_step_conf": 0.10424716770648956,
"num_tokens": 25193733.0,
"reward": 0.7750270962715149,
"reward_std": 0.19859516620635986,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7339316606521606,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8161225914955139,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6058851480484009,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7565438151359558,
"adv/std_final_conf": 0.8274356126785278,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341694116592407,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.756379062948406,
"calib/avg_num_step_conf": 4.70703125,
"calib/ece": 0.1823070866141732,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.531496062992126,
"calib/gap": 0.3437558175806351,
"calib/mean_conf": 0.666984251968504,
"calib/mu_c": 0.8253284671532847,
"calib/mu_w": 0.48157264957264956,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1549606299212598,
"calib/std_conf": 0.36867525099588855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.555424292845258,
"calib/step_q_c_n": 601.0,
"calib/step_q_gap": 0.1408630345671123,
"calib/step_q_w": 0.41456125827814566,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2385.0,
"completions/max_terminated_length": 2385.0,
"completions/mean_length": 456.69921875,
"completions/mean_terminated_length": 456.69921875,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.09702730923891068,
"kl": 0.1219024658203125,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0079,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036941565573215485,
"mask/share_reasoning": 0.8542109727859497,
"mask/share_step_conf": 0.1088474690914154,
"num_tokens": 25415232.0,
"reward": 0.8071750402450562,
"reward_std": 0.15165673196315765,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7641385793685913,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.850211501121521,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.5851588249206543,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7315694689750671,
"adv/std_final_conf": 0.8237432837486267,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343537092208862,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7436145510835913,
"calib/avg_num_step_conf": 4.90234375,
"calib/ece": 0.17437007874015742,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.515748031496063,
"calib/gap": 0.3058230134158927,
"calib/mean_conf": 0.6931102362204725,
"calib/mu_c": 0.815921052631579,
"calib/mu_w": 0.5100980392156863,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13452755905511804,
"calib/std_conf": 0.3450290579648353,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5534992223950232,
"calib/step_q_c_n": 643.0,
"calib/step_q_gap": 0.15557438579371607,
"calib/step_q_w": 0.3979248366013072,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2708.0,
"completions/max_terminated_length": 2708.0,
"completions/mean_length": 463.87890625,
"completions/mean_terminated_length": 463.87890625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.05170339345932007,
"kl": 0.06480026245117188,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0818,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03820245712995529,
"mask/share_reasoning": 0.848260760307312,
"mask/share_step_conf": 0.11353675276041031,
"num_tokens": 25638601.0,
"reward": 0.8206563591957092,
"reward_std": 0.14223712682724,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7725800275802612,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.868732750415802,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.5855361819267273,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7494146823883057,
"adv/std_final_conf": 0.8314166069030762,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9335763454437256,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.740392796369824,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.15551181102362197,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6377952755905512,
"calib/gap": 0.3404169030062393,
"calib/mean_conf": 0.7590551181102363,
"calib/mu_c": 0.8689534883720929,
"calib/mu_w": 0.5285365853658536,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11870078740157472,
"calib/std_conf": 0.3434870582230404,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5365839909808343,
"calib/step_q_c_n": 887.0,
"calib/step_q_gap": 0.08348328840472191,
"calib/step_q_w": 0.4531007025761124,
"calib/step_q_w_n": 427.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1989.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 493.80859375,
"completions/mean_terminated_length": 495.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.1152,
"grad_norm": 0.04973267391324043,
"kl": 0.071014404296875,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0653,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03569352626800537,
"mask/share_reasoning": 0.8444836139678955,
"mask/share_step_conf": 0.11591663956642151,
"num_tokens": 25868248.0,
"reward": 0.8126301765441895,
"reward_std": 0.1515788584947586,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7874480485916138,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8378124237060547,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.5870604515075684,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7535527944564819,
"adv/std_final_conf": 0.8096221685409546,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340676069259644,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8179909560723514,
"calib/avg_num_step_conf": 4.98046875,
"calib/ece": 0.13384738955823292,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.46987951807228917,
"calib/gap": 0.467429069767442,
"calib/mean_conf": 0.5943453815261045,
"calib/mu_c": 0.8196124031007753,
"calib/mu_w": 0.35218333333333335,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10506024096385541,
"calib/std_conf": 0.3991824088947798,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5310361067503925,
"calib/step_q_c_n": 637.0,
"calib/step_q_gap": 0.20546243903879374,
"calib/step_q_w": 0.32557366771159874,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2402.0,
"completions/max_terminated_length": 2402.0,
"completions/mean_length": 496.203125,
"completions/mean_terminated_length": 500.1102294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.045122113078832626,
"kl": 0.06939697265625,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0858,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035198867321014404,
"mask/share_reasoning": 0.8491263389587402,
"mask/share_step_conf": 0.10786230862140656,
"num_tokens": 26099876.0,
"reward": 0.8194234371185303,
"reward_std": 0.16372306644916534,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7931814193725586,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8456655144691467,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7107264995574951,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7442977428436279,
"adv/std_final_conf": 0.8976032733917236,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342363476753235,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6777255639097743,
"calib/avg_num_step_conf": 3.9921875,
"calib/ece": 0.24999999999999994,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5770750988142292,
"calib/gap": 0.28871303258145364,
"calib/mean_conf": 0.6849407114624507,
"calib/mu_c": 0.8218796992481203,
"calib/mu_w": 0.5331666666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20462450592885373,
"calib/std_conf": 0.3900842318619402,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5682617187500001,
"calib/step_q_c_n": 512.0,
"calib/step_q_gap": 0.13175191482843146,
"calib/step_q_w": 0.43650980392156863,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2629.0,
"completions/max_terminated_length": 2629.0,
"completions/mean_length": 431.84375,
"completions/mean_terminated_length": 431.84375,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.06673236936330795,
"kl": 0.06946563720703125,
"learning_rate": 2.5e-06,
"loss": -0.0362,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03971089422702789,
"mask/share_reasoning": 0.8535101413726807,
"mask/share_step_conf": 0.10677894949913025,
"num_tokens": 26315348.0,
"reward": 0.7538522481918335,
"reward_std": 0.19147515296936035,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.704800009727478,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8029043674468994,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.5409973859786987,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7609439492225647,
"adv/std_final_conf": 0.7927687764167786,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348319172859192,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7098196120981961,
"calib/avg_num_step_conf": 3.90625,
"calib/ece": 0.22963562753036426,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.611336032388664,
"calib/gap": 0.2984239793842397,
"calib/mean_conf": 0.7072874493927125,
"calib/mu_c": 0.8293150684931507,
"calib/mu_w": 0.530891089108911,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17291497975708492,
"calib/std_conf": 0.37932970175825215,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5571863799283153,
"calib/step_q_c_n": 558.0,
"calib/step_q_gap": 0.16211850662514787,
"calib/step_q_w": 0.39506787330316745,
"calib/step_q_w_n": 442.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 443.15234375,
"completions/mean_terminated_length": 444.8902282714844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1184,
"grad_norm": 0.04155619442462921,
"kl": 0.06301116943359375,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0501,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.04270726814866066,
"mask/share_reasoning": 0.847565770149231,
"mask/share_step_conf": 0.1058206781744957,
"num_tokens": 26536203.0,
"reward": 0.7658801674842834,
"reward_std": 0.16763192415237427,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7185871005058289,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8131731748580933,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6597450971603394,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7538222074508667,
"adv/std_final_conf": 0.8582755327224731,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354592561721802,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7894385496451408,
"calib/avg_num_step_conf": 4.04296875,
"calib/ece": 0.17149999999999999,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.4375,
"calib/gap": 0.4543096057901766,
"calib/mean_conf": 0.5293333333333334,
"calib/mu_c": 0.7318796992481205,
"calib/mu_w": 0.27757009345794387,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.0733333333333333,
"calib/std_conf": 0.42730206593878706,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5136745886654479,
"calib/step_q_c_n": 547.0,
"calib/step_q_gap": 0.21873606407528395,
"calib/step_q_w": 0.29493852459016395,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 546.83203125,
"completions/mean_terminated_length": 548.9765014648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.06552662700414658,
"kl": 0.06233978271484375,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0972,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.0337638258934021,
"mask/share_reasoning": 0.8707628846168518,
"mask/share_step_conf": 0.0915670171380043,
"num_tokens": 26784112.0,
"reward": 0.7577349543571472,
"reward_std": 0.21430005133152008,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7407132387161255,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7747566103935242,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.660697340965271,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7497421503067017,
"adv/std_final_conf": 0.8753498196601868,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342508912086487,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7964874691598495,
"calib/avg_num_step_conf": 4.4296875,
"calib/ece": 0.18213438735177867,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.34782608695652173,
"calib/gap": 0.4065037008180757,
"calib/mean_conf": 0.4911462450592886,
"calib/mu_c": 0.6550331125827816,
"calib/mu_w": 0.24852941176470594,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.03822134387351779,
"calib/std_conf": 0.3982390466744529,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4483663220088626,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.12703152988632432,
"calib/step_q_w": 0.3213347921225383,
"calib/step_q_w_n": 457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1671.0,
"completions/max_terminated_length": 1671.0,
"completions/mean_length": 409.04296875,
"completions/mean_terminated_length": 410.6470947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.04978412389755249,
"kl": 0.09747314453125,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.1001,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04079805687069893,
"mask/share_reasoning": 0.8408189415931702,
"mask/share_step_conf": 0.11447672545909882,
"num_tokens": 26994027.0,
"reward": 0.8243892192840576,
"reward_std": 0.14668944478034973,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7760382890701294,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8727402687072754,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6948554515838623,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7704617977142334,
"adv/std_final_conf": 0.8857570290565491,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337559938430786,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7926232993197279,
"calib/avg_num_step_conf": 4.3671875,
"calib/ece": 0.16404761904761905,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.47875,
"calib/mean_conf": 0.6176190476190476,
"calib/mu_c": 0.7772023809523809,
"calib/mu_w": 0.2984523809523809,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.057499999999999996,
"calib/std_conf": 0.41437068094218465,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.48268125854993166,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.23211278309773525,
"calib/step_q_w": 0.2505684754521964,
"calib/step_q_w_n": 387.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1833.0,
"completions/max_terminated_length": 1833.0,
"completions/mean_length": 427.68359375,
"completions/mean_terminated_length": 429.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1216,
"grad_norm": 0.07593639940023422,
"kl": 0.0800933837890625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0511,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.042328353971242905,
"mask/share_reasoning": 0.8368827104568481,
"mask/share_step_conf": 0.11688268184661865,
"num_tokens": 27208538.0,
"reward": 0.834463894367218,
"reward_std": 0.16004055738449097,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7997835874557495,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8691442012786865,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.7465718984603882,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7727314233779907,
"adv/std_final_conf": 0.9083997011184692,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353749752044678,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7171296889002688,
"calib/avg_num_step_conf": 4.15234375,
"calib/ece": 0.23284584980237144,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3952569169960474,
"calib/gap": 0.34606324414287537,
"calib/mean_conf": 0.4921343873517787,
"calib/mu_c": 0.6384931506849315,
"calib/mu_w": 0.29242990654205614,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07395256916996037,
"calib/std_conf": 0.42591008083843535,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4040268987341772,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": 0.07579255998707746,
"calib/step_q_w": 0.32823433874709973,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2273.0,
"completions/max_terminated_length": 2273.0,
"completions/mean_length": 400.41796875,
"completions/mean_terminated_length": 400.41796875,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.06251219660043716,
"kl": 0.09407806396484375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.105,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.040772609412670135,
"mask/share_reasoning": 0.8457399606704712,
"mask/share_step_conf": 0.11348745226860046,
"num_tokens": 27416309.0,
"reward": 0.7573823928833008,
"reward_std": 0.22537976503372192,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7236734628677368,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7910914421081543,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.6926306486129761,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7407446503639221,
"adv/std_final_conf": 0.88161700963974,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343625903129578,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7317920918367348,
"calib/avg_num_step_conf": 4.44140625,
"calib/ece": 0.23539682539682544,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.40476190476190477,
"calib/gap": 0.36535714285714294,
"calib/mean_conf": 0.5276190476190477,
"calib/mu_c": 0.6900000000000001,
"calib/mu_w": 0.3246428571428571,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10373015873015878,
"calib/std_conf": 0.4323705945895947,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4250610820244328,
"calib/step_q_c_n": 573.0,
"calib/step_q_gap": 0.1750078905350711,
"calib/step_q_w": 0.2500531914893617,
"calib/step_q_w_n": 564.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2619.0,
"completions/max_terminated_length": 2619.0,
"completions/mean_length": 470.4765625,
"completions/mean_terminated_length": 470.4765625,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.07935766875743866,
"kl": 0.0879974365234375,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0241,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03920193389058113,
"mask/share_reasoning": 0.8547663688659668,
"mask/share_step_conf": 0.10603173077106476,
"num_tokens": 27641271.0,
"reward": 0.7947357296943665,
"reward_std": 0.1647808700799942,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7312819957733154,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8581894040107727,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.678554892539978,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7296802997589111,
"adv/std_final_conf": 0.8839506506919861,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351888298988342,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7323589648110221,
"calib/avg_num_step_conf": 4.30078125,
"calib/ece": 0.2168070866141732,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.468503937007874,
"calib/gap": 0.3472533358158008,
"calib/mean_conf": 0.5983031496062992,
"calib/mu_c": 0.7773983739837398,
"calib/mu_w": 0.430145038167939,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16542913385826766,
"calib/std_conf": 0.4125910525014487,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47146185567010307,
"calib/step_q_c_n": 485.0,
"calib/step_q_gap": 0.13851542709867448,
"calib/step_q_w": 0.3329464285714286,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2298.0,
"completions/max_terminated_length": 2298.0,
"completions/mean_length": 452.59765625,
"completions/mean_terminated_length": 452.59765625,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1248,
"grad_norm": 0.04363831505179405,
"kl": 0.0821533203125,
"learning_rate": 2.305555555555556e-06,
"loss": -0.1175,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0384393110871315,
"mask/share_reasoning": 0.8544684648513794,
"mask/share_step_conf": 0.10709226131439209,
"num_tokens": 27863736.0,
"reward": 0.7832147479057312,
"reward_std": 0.18473175168037415,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7263078093528748,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8401218056678772,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.5879744291305542,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7720198035240173,
"adv/std_final_conf": 0.7948587536811829,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346683621406555,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7168510084580351,
"calib/avg_num_step_conf": 4.984375,
"calib/ece": 0.26019920318725087,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6254980079681275,
"calib/gap": 0.28801821730644106,
"calib/mean_conf": 0.726573705179283,
"calib/mu_c": 0.8482068965517241,
"calib/mu_w": 0.560188679245283,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.20454183266932263,
"calib/std_conf": 0.3862087321728891,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4322253315758167,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.11845049713873068,
"calib/step_q_w": 0.31377483443708604,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 485.765625,
"completions/mean_terminated_length": 487.6706237792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.037945739924907684,
"kl": 0.0703125,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0543,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.036695875227451324,
"mask/share_reasoning": 0.845686674118042,
"mask/share_step_conf": 0.1137111485004425,
"num_tokens": 28092100.0,
"reward": 0.7686008810997009,
"reward_std": 0.19075638055801392,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7060140371322632,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8311876058578491,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.6270201802253723,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7426635026931763,
"adv/std_final_conf": 0.8438059687614441,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.929425835609436,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7711167352103765,
"calib/avg_num_step_conf": 4.48046875,
"calib/ece": 0.21615748031496046,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6141732283464567,
"calib/gap": 0.40604340398608035,
"calib/mean_conf": 0.6786771653543308,
"calib/mu_c": 0.8529241379310345,
"calib/mu_w": 0.44688073394495414,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16198425196850375,
"calib/std_conf": 0.41690956896091674,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.504838953488372,
"calib/step_q_c_n": 516.0,
"calib/step_q_gap": 0.27353942892418825,
"calib/step_q_w": 0.23129952456418382,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2586.0,
"completions/max_terminated_length": 2586.0,
"completions/mean_length": 502.93359375,
"completions/mean_terminated_length": 502.93359375,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.05070538446307182,
"kl": 0.07465362548828125,
"learning_rate": 2.25e-06,
"loss": -0.0048,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036642082035541534,
"mask/share_reasoning": 0.8676231503486633,
"mask/share_step_conf": 0.09573476016521454,
"num_tokens": 28325915.0,
"reward": 0.8076252937316895,
"reward_std": 0.20259562134742737,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7510377764701843,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8642127513885498,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.5917130708694458,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7489132881164551,
"adv/std_final_conf": 0.8114206790924072,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350486993789673,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7984442237669647,
"calib/avg_num_step_conf": 4.14453125,
"calib/ece": 0.1860236220472442,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6299212598425197,
"calib/gap": 0.48683813306852053,
"calib/mean_conf": 0.6788582677165355,
"calib/mu_c": 0.8609433962264154,
"calib/mu_w": 0.37410526315789483,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11944881889763792,
"calib/std_conf": 0.4254833698538593,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.44790935672514615,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.12578734081002674,
"calib/step_q_w": 0.3221220159151194,
"calib/step_q_w_n": 377.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2058.0,
"completions/max_terminated_length": 2058.0,
"completions/mean_length": 414.00390625,
"completions/mean_terminated_length": 414.00390625,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.128,
"grad_norm": 0.04255034402012825,
"kl": 0.0883026123046875,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0381,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04009437561035156,
"mask/share_reasoning": 0.8499317765235901,
"mask/share_step_conf": 0.10997384041547775,
"num_tokens": 28538588.0,
"reward": 0.8278021812438965,
"reward_std": 0.17815116047859192,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.803676962852478,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8519275188446045,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.5864112377166748,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7474852800369263,
"adv/std_final_conf": 0.8054826259613037,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345322847366333,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6949251789199737,
"calib/avg_num_step_conf": 5.1015625,
"calib/ece": 0.27322310756972107,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7330677290836654,
"calib/gap": 0.2780366948601172,
"calib/mean_conf": 0.8007131474103586,
"calib/mu_c": 0.9181310344827588,
"calib/mu_w": 0.6400943396226416,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2481235059760956,
"calib/std_conf": 0.3492089714818343,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4264383808095952,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.09734604904120703,
"calib/step_q_w": 0.32909233176838815,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 505.58203125,
"completions/mean_terminated_length": 507.5647277832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.04652104154229164,
"kl": 0.06574249267578125,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0736,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035734884440898895,
"mask/share_reasoning": 0.8535595536231995,
"mask/share_step_conf": 0.10679930448532104,
"num_tokens": 28773073.0,
"reward": 0.7692856788635254,
"reward_std": 0.19570022821426392,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7019648551940918,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.836606502532959,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.5605442523956299,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7494107484817505,
"adv/std_final_conf": 0.7812092304229736,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340590834617615,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7425825593395254,
"calib/avg_num_step_conf": 4.8125,
"calib/ece": 0.22480314960629924,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7440944881889764,
"calib/gap": 0.3767298761609905,
"calib/mean_conf": 0.7833858267716536,
"calib/mu_c": 0.9346710526315789,
"calib/mu_w": 0.5579411764705884,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20488188976377955,
"calib/std_conf": 0.37253699382426914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.578234328358209,
"calib/step_q_c_n": 670.0,
"calib/step_q_gap": 0.27546564508418764,
"calib/step_q_w": 0.30276868327402134,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2436.0,
"completions/max_terminated_length": 2436.0,
"completions/mean_length": 451.31640625,
"completions/mean_terminated_length": 451.31640625,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.041189853101968765,
"kl": 0.07921600341796875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0203,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03811654448509216,
"mask/share_reasoning": 0.8494611978530884,
"mask/share_step_conf": 0.11242222785949707,
"num_tokens": 28995954.0,
"reward": 0.8048402667045593,
"reward_std": 0.17405804991722107,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.761760950088501,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8479195833206177,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.6796097755432129,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7547227740287781,
"adv/std_final_conf": 0.8744975924491882,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348601698875427,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6908712343028174,
"calib/avg_num_step_conf": 5.16015625,
"calib/ece": 0.2880400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.2701782809551696,
"calib/mean_conf": 0.7597200000000001,
"calib/mu_c": 0.8775177304964539,
"calib/mu_w": 0.6073394495412843,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.24188000000000004,
"calib/std_conf": 0.38318601435856187,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.40191558441558445,
"calib/step_q_c_n": 616.0,
"calib/step_q_gap": 0.1488362936354426,
"calib/step_q_w": 0.25307929078014185,
"calib/step_q_w_n": 705.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 541.19140625,
"completions/mean_terminated_length": 543.3137817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1312,
"grad_norm": 0.04248799383640289,
"kl": 0.0621795654296875,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0327,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.034889135509729385,
"mask/share_reasoning": 0.8600262999534607,
"mask/share_step_conf": 0.10117833316326141,
"num_tokens": 29239787.0,
"reward": 0.7408413887023926,
"reward_std": 0.22834190726280212,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6735906600952148,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8080922365188599,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.5587438344955444,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7677050828933716,
"adv/std_final_conf": 0.8090704679489136,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350267052650452,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6976533690915186,
"calib/avg_num_step_conf": 4.6171875,
"calib/ece": 0.2434444444444443,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.746031746031746,
"calib/gap": 0.2915293328863561,
"calib/mean_conf": 0.7932063492063494,
"calib/mu_c": 0.9031082802547772,
"calib/mu_w": 0.6115789473684211,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20681746031746015,
"calib/std_conf": 0.3628922249701894,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5021832386363637,
"calib/step_q_c_n": 704.0,
"calib/step_q_gap": 0.1642752888455688,
"calib/step_q_w": 0.3379079497907949,
"calib/step_q_w_n": 478.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2170.0,
"completions/max_terminated_length": 2170.0,
"completions/mean_length": 486.1328125,
"completions/mean_terminated_length": 486.1328125,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.06367672979831696,
"kl": 0.0710906982421875,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0453,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03547365963459015,
"mask/share_reasoning": 0.8601535558700562,
"mask/share_step_conf": 0.10437280684709549,
"num_tokens": 29471053.0,
"reward": 0.7744747996330261,
"reward_std": 0.180791974067688,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7260277271270752,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.822921872138977,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.6150028109550476,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7723146080970764,
"adv/std_final_conf": 0.8241979479789734,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339441061019897,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6313887793982393,
"calib/avg_num_step_conf": 4.73046875,
"calib/ece": 0.35696356275303653,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7206477732793523,
"calib/gap": 0.17255551175929595,
"calib/mean_conf": 0.7746963562753035,
"calib/mu_c": 0.8571317829457364,
"calib/mu_w": 0.6845762711864405,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3046963562753037,
"calib/std_conf": 0.3729559797901317,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5079631782945737,
"calib/step_q_c_n": 516.0,
"calib/step_q_gap": 0.1762365595895377,
"calib/step_q_w": 0.331726618705036,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2824.0,
"completions/max_terminated_length": 2824.0,
"completions/mean_length": 523.515625,
"completions/mean_terminated_length": 525.5686645507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.04430394247174263,
"kl": 0.06725311279296875,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0845,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03517274558544159,
"mask/share_reasoning": 0.8596398830413818,
"mask/share_step_conf": 0.10128115117549896,
"num_tokens": 29709881.0,
"reward": 0.7005262970924377,
"reward_std": 0.18713583052158356,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.611504316329956,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7895482778549194,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5662118196487427,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7217642068862915,
"adv/std_final_conf": 0.7941136956214905,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9217687249183655,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7514391583405016,
"calib/avg_num_step_conf": 4.63671875,
"calib/ece": 0.2498979674796747,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6707317073170732,
"calib/gap": 0.40203121153973354,
"calib/mean_conf": 0.7146947154471545,
"calib/mu_c": 0.9091732283464564,
"calib/mu_w": 0.5071420168067229,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.22416626016260152,
"calib/std_conf": 0.4150856757635778,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5724484052532833,
"calib/step_q_c_n": 533.0,
"calib/step_q_gap": 0.3069042156508369,
"calib/step_q_w": 0.2655441896024464,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2757.0,
"completions/max_terminated_length": 2757.0,
"completions/mean_length": 493.0703125,
"completions/mean_terminated_length": 498.9170227050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.1344,
"grad_norm": 0.04063578322529793,
"kl": 0.08344650268554688,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.1481,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03813691437244415,
"mask/share_reasoning": 0.8446433544158936,
"mask/share_step_conf": 0.10550101101398468,
"num_tokens": 29941571.0,
"reward": 0.7483680844306946,
"reward_std": 0.21162956953048706,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6991752982139587,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7975609302520752,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.5917777419090271,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7527652978897095,
"adv/std_final_conf": 0.7995696067810059,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351709485054016,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7078985602958658,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.27238866396761124,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.659919028340081,
"calib/gap": 0.3514172500330208,
"calib/mean_conf": 0.6968421052631579,
"calib/mu_c": 0.8576119402985076,
"calib/mu_w": 0.5061946902654868,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21336032388663959,
"calib/std_conf": 0.42501410713479154,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5186550632911392,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": 0.22045269992334005,
"calib/step_q_w": 0.29820236336779915,
"calib/step_q_w_n": 677.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2662.0,
"completions/max_terminated_length": 2662.0,
"completions/mean_length": 524.87890625,
"completions/mean_terminated_length": 524.87890625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.05239735171198845,
"kl": 0.06211090087890625,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0059,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.037127815186977386,
"mask/share_reasoning": 0.8489212989807129,
"mask/share_step_conf": 0.1139509379863739,
"num_tokens": 30179612.0,
"reward": 0.7568548917770386,
"reward_std": 0.22605092823505402,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6964148283004761,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8172948360443115,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.6582461595535278,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.781390905380249,
"adv/std_final_conf": 0.8269768357276917,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352853894233704,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6791420515574651,
"calib/avg_num_step_conf": 4.265625,
"calib/ece": 0.31408163265306127,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7142857142857143,
"calib/gap": 0.258388157894737,
"calib/mean_conf": 0.7728571428571429,
"calib/mu_c": 0.8909774436090228,
"calib/mu_w": 0.6325892857142857,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.27204081632653065,
"calib/std_conf": 0.38220413393892005,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.56017578125,
"calib/step_q_c_n": 512.0,
"calib/step_q_gap": 0.24564302262931037,
"calib/step_q_w": 0.31453275862068963,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2670.0,
"completions/max_terminated_length": 2670.0,
"completions/mean_length": 490.71484375,
"completions/mean_terminated_length": 496.53363037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.06866651773452759,
"kl": 0.066314697265625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0601,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.04000090807676315,
"mask/share_reasoning": 0.8460492491722107,
"mask/share_step_conf": 0.10223108530044556,
"num_tokens": 30411899.0,
"reward": 0.7223763465881348,
"reward_std": 0.24244219064712524,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.647929310798645,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7968234419822693,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.6572911143302917,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7690467834472656,
"adv/std_final_conf": 0.8524337410926819,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346084594726562,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6090235256203675,
"calib/avg_num_step_conf": 4.7265625,
"calib/ece": 0.3544801587301588,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7738095238095238,
"calib/gap": 0.15475494682565272,
"calib/mean_conf": 0.8045595238095239,
"calib/mu_c": 0.8702689655172414,
"calib/mu_w": 0.7155140186915887,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2918214285714287,
"calib/std_conf": 0.3597985663696813,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5545014492753623,
"calib/step_q_c_n": 690.0,
"calib/step_q_gap": 0.11338606465997764,
"calib/step_q_w": 0.44111538461538463,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2840.0,
"completions/max_terminated_length": 2840.0,
"completions/mean_length": 443.8984375,
"completions/mean_terminated_length": 443.8984375,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.1376,
"grad_norm": 0.0634075254201889,
"kl": 0.0722503662109375,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0209,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03835558146238327,
"mask/share_reasoning": 0.8439362049102783,
"mask/share_step_conf": 0.11770817637443542,
"num_tokens": 30627921.0,
"reward": 0.7297005653381348,
"reward_std": 0.20747110247612,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6390316486358643,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8203694820404053,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.5336794853210449,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7394791841506958,
"adv/std_final_conf": 0.7902102470397949,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340017437934875,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6969356486210418,
"calib/avg_num_step_conf": 4.05078125,
"calib/ece": 0.22712598425196848,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7362204724409449,
"calib/gap": 0.3584725910793326,
"calib/mean_conf": 0.7657874015748032,
"calib/mu_c": 0.8913939393939393,
"calib/mu_w": 0.5329213483146067,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17165354330708657,
"calib/std_conf": 0.39243331606505466,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5255920000000001,
"calib/step_q_c_n": 625.0,
"calib/step_q_gap": 0.10928132038834965,
"calib/step_q_w": 0.4163106796116504,
"calib/step_q_w_n": 412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1738.0,
"completions/max_terminated_length": 1738.0,
"completions/mean_length": 407.18359375,
"completions/mean_terminated_length": 408.7804260253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.04523677006363869,
"kl": 0.07286834716796875,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0415,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.042861148715019226,
"mask/share_reasoning": 0.8466463088989258,
"mask/share_step_conf": 0.10658632218837738,
"num_tokens": 30837448.0,
"reward": 0.7943114042282104,
"reward_std": 0.16882899403572083,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7463679313659668,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8422548770904541,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6024599671363831,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7664506435394287,
"adv/std_final_conf": 0.8044640421867371,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934328556060791,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7555746687054026,
"calib/avg_num_step_conf": 4.625,
"calib/ece": 0.2511462450592885,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5454545454545454,
"calib/gap": 0.4317214576962283,
"calib/mean_conf": 0.5933596837944665,
"calib/mu_c": 0.8390825688073394,
"calib/mu_w": 0.4073611111111112,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2068379446640316,
"calib/std_conf": 0.45306790959702653,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5259521829521829,
"calib/step_q_c_n": 481.0,
"calib/step_q_gap": 0.18824236787394683,
"calib/step_q_w": 0.3377098150782361,
"calib/step_q_w_n": 703.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2256.0,
"completions/max_terminated_length": 2256.0,
"completions/mean_length": 446.3671875,
"completions/mean_terminated_length": 446.3671875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.04830202832818031,
"kl": 0.07916259765625,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0219,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03808034956455231,
"mask/share_reasoning": 0.8501123189926147,
"mask/share_step_conf": 0.11180734634399414,
"num_tokens": 31057926.0,
"reward": 0.7807968258857727,
"reward_std": 0.1836085319519043,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.7262164354324341,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8353773355484009,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.532532811164856,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7684825658798218,
"adv/std_final_conf": 0.7594923377037048,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9334070682525635,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7674441654552663,
"calib/avg_num_step_conf": 4.6328125,
"calib/ece": 0.21380392156862743,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6588235294117647,
"calib/gap": 0.4566994846042024,
"calib/mean_conf": 0.6786666666666666,
"calib/mu_c": 0.8470186335403727,
"calib/mu_w": 0.39031914893617026,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13054901960784313,
"calib/std_conf": 0.438481992237241,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5852209469153515,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.2458119489603413,
"calib/step_q_w": 0.3394089979550102,
"calib/step_q_w_n": 489.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1687.0,
"completions/max_terminated_length": 1687.0,
"completions/mean_length": 450.15625,
"completions/mean_terminated_length": 451.9216003417969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.1408,
"grad_norm": 0.06343978643417358,
"kl": 0.071075439453125,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0479,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03952445834875107,
"mask/share_reasoning": 0.8402537107467651,
"mask/share_step_conf": 0.11631553620100021,
"num_tokens": 31278758.0,
"reward": 0.8163669109344482,
"reward_std": 0.17662331461906433,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7783671617507935,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8543666005134583,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.6812657117843628,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7408666610717773,
"adv/std_final_conf": 0.8782057166099548,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352087378501892,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7184826950914593,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.27525896414342643,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.38645418326693226,
"calib/gap": 0.37108501118568243,
"calib/mean_conf": 0.4380478087649402,
"calib/mu_c": 0.6583333333333334,
"calib/mu_w": 0.287248322147651,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1534661354581674,
"calib/std_conf": 0.4592274103833413,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43027338129496406,
"calib/step_q_c_n": 556.0,
"calib/step_q_gap": 0.13507753884419815,
"calib/step_q_w": 0.2951958424507659,
"calib/step_q_w_n": 914.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3002.0,
"completions/max_terminated_length": 3002.0,
"completions/mean_length": 577.0546875,
"completions/mean_terminated_length": 579.3176879882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.043094903230667114,
"kl": 0.0680389404296875,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0796,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03033336251974106,
"mask/share_reasoning": 0.8618874549865723,
"mask/share_step_conf": 0.10387295484542847,
"num_tokens": 31532828.0,
"reward": 0.7787151336669922,
"reward_std": 0.19636546075344086,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.7078253626823425,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8496048450469971,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7835479378700256,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7636693716049194,
"adv/std_final_conf": 0.915665328502655,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.93454509973526,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6835707502374169,
"calib/avg_num_step_conf": 4.62890625,
"calib/ece": 0.3326984126984126,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.40476190476190477,
"calib/gap": 0.2778803418803419,
"calib/mean_conf": 0.4380952380952381,
"calib/mu_c": 0.5671111111111111,
"calib/mu_w": 0.28923076923076924,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1175396825396824,
"calib/std_conf": 0.4661344146456114,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.43978489736070386,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.14602744209231422,
"calib/step_q_w": 0.29375745526838964,
"calib/step_q_w_n": 503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2680.0,
"completions/max_terminated_length": 2680.0,
"completions/mean_length": 550.03515625,
"completions/mean_terminated_length": 550.03515625,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.08178877085447311,
"kl": 0.08254241943359375,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0811,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03342124819755554,
"mask/share_reasoning": 0.8704813718795776,
"mask/share_step_conf": 0.09609738737344742,
"num_tokens": 31782589.0,
"reward": 0.718065619468689,
"reward_std": 0.24075214564800262,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6291484236717224,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8069828152656555,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.6751786470413208,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7397767305374146,
"adv/std_final_conf": 0.8756873607635498,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341914057731628,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6747403748733536,
"calib/avg_num_step_conf": 5.06640625,
"calib/ece": 0.32762845849802363,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3952569169960474,
"calib/gap": 0.3019674518743669,
"calib/mean_conf": 0.4383794466403162,
"calib/mu_c": 0.5720567375886526,
"calib/mu_w": 0.2700892857142857,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.10434782608695645,
"calib/std_conf": 0.46559085489573315,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4196435185185185,
"calib/step_q_c_n": 648.0,
"calib/step_q_gap": 0.11991778662329505,
"calib/step_q_w": 0.29972573189522345,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2447.0,
"completions/max_terminated_length": 2447.0,
"completions/mean_length": 507.140625,
"completions/mean_terminated_length": 509.12945556640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.144,
"grad_norm": 0.06607788056135178,
"kl": 0.06987762451171875,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0134,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03679756075143814,
"mask/share_reasoning": 0.8516077995300293,
"mask/share_step_conf": 0.10768839716911316,
"num_tokens": 32018297.0,
"reward": 0.7513500452041626,
"reward_std": 0.21713411808013916,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6563273072242737,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8463728427886963,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.5889009237289429,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7550169229507446,
"adv/std_final_conf": 0.8102096915245056,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9327640533447266,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7787312484108824,
"calib/avg_num_step_conf": 5.25,
"calib/ece": 0.23396825396825383,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.34523809523809523,
"calib/gap": 0.450045766590389,
"calib/mean_conf": 0.37047619047619046,
"calib/mu_c": 0.6169298245614034,
"calib/mu_w": 0.1668840579710145,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0760317460317459,
"calib/std_conf": 0.45675156865067534,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4747941747572816,
"calib/step_q_c_n": 515.0,
"calib/step_q_gap": 0.22916329417827072,
"calib/step_q_w": 0.24563088057901086,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 505.6171875,
"completions/mean_terminated_length": 505.6171875,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.03182613477110863,
"kl": 0.0801849365234375,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0865,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03527706116437912,
"mask/share_reasoning": 0.8453294634819031,
"mask/share_step_conf": 0.1193934828042984,
"num_tokens": 32256223.0,
"reward": 0.8134219646453857,
"reward_std": 0.16991086304187775,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7480453252792358,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8787985444068909,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6382880210876465,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7336064577102661,
"adv/std_final_conf": 0.8308758735656738,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345904588699341,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7714179958274446,
"calib/avg_num_step_conf": 6.16796875,
"calib/ece": 0.2379098360655738,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.38934426229508196,
"calib/gap": 0.46010969782623334,
"calib/mean_conf": 0.43709016393442623,
"calib/mu_c": 0.657716535433071,
"calib/mu_w": 0.1976068376068376,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07725409836065578,
"calib/std_conf": 0.46236222589657855,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4219350710900474,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.19407037764395862,
"calib/step_q_w": 0.22786469344608878,
"calib/step_q_w_n": 946.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2909.0,
"completions/max_terminated_length": 2909.0,
"completions/mean_length": 532.28515625,
"completions/mean_terminated_length": 536.4763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.07119835913181305,
"kl": 0.07416915893554688,
"learning_rate": 1.75e-06,
"loss": -0.0667,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03368588909506798,
"mask/share_reasoning": 0.8412888050079346,
"mask/share_step_conf": 0.11721283197402954,
"num_tokens": 32499472.0,
"reward": 0.780731737613678,
"reward_std": 0.22005677223205566,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7236812114715576,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8377822637557983,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.6097594499588013,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.768215537071228,
"adv/std_final_conf": 0.8194006085395813,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343672394752502,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7697385881104034,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.23857707509881432,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.44664031620553357,
"calib/gap": 0.48071191613588116,
"calib/mean_conf": 0.4977865612648221,
"calib/mu_c": 0.6801910828025478,
"calib/mu_w": 0.19947916666666665,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05790513833992102,
"calib/std_conf": 0.4651250861308752,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4216188679245283,
"calib/step_q_c_n": 795.0,
"calib/step_q_gap": 0.13706984831668517,
"calib/step_q_w": 0.2845490196078431,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 480.7265625,
"completions/mean_terminated_length": 480.7265625,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1472,
"grad_norm": 0.08591213822364807,
"kl": 0.08078765869140625,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0342,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03658757731318474,
"mask/share_reasoning": 0.847145676612854,
"mask/share_step_conf": 0.11626674234867096,
"num_tokens": 32726874.0,
"reward": 0.8037103414535522,
"reward_std": 0.18178507685661316,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7506031394004822,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8568174839019775,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6061331033706665,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7485789060592651,
"adv/std_final_conf": 0.8287302255630493,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933654248714447,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7235772357723577,
"calib/avg_num_step_conf": 4.9453125,
"calib/ece": 0.29862204724409447,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4409448818897638,
"calib/gap": 0.3605257452574526,
"calib/mean_conf": 0.5006692913385827,
"calib/mu_c": 0.6284146341463415,
"calib/mu_w": 0.26788888888888884,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07681102362204724,
"calib/std_conf": 0.460937021593922,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4152287581699346,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.1379692771320104,
"calib/step_q_w": 0.2772594810379242,
"calib/step_q_w_n": 501.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1586.0,
"completions/max_terminated_length": 1586.0,
"completions/mean_length": 446.5078125,
"completions/mean_terminated_length": 448.25885009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.06311675906181335,
"kl": 0.084686279296875,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0278,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03723848611116409,
"mask/share_reasoning": 0.8437467813491821,
"mask/share_step_conf": 0.11510850489139557,
"num_tokens": 32944276.0,
"reward": 0.7825126051902771,
"reward_std": 0.15829968452453613,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6972042918205261,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8678209781646729,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.5985676646232605,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7452235221862793,
"adv/std_final_conf": 0.8259685039520264,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9336854219436646,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7688974991194083,
"calib/avg_num_step_conf": 4.80078125,
"calib/ece": 0.18932539682539679,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6587301587301587,
"calib/gap": 0.4551680169073617,
"calib/mean_conf": 0.6959920634920636,
"calib/mu_c": 0.8495209580838322,
"calib/mu_w": 0.3943529411764705,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11130952380952376,
"calib/std_conf": 0.4273592406534957,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5014871794871795,
"calib/step_q_c_n": 780.0,
"calib/step_q_gap": 0.19293484095711266,
"calib/step_q_w": 0.3085523385300668,
"calib/step_q_w_n": 449.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2404.0,
"completions/max_terminated_length": 2404.0,
"completions/mean_length": 459.90625,
"completions/mean_terminated_length": 461.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.09437122941017151,
"kl": 0.15157318115234375,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0518,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036479588598012924,
"mask/share_reasoning": 0.8461323380470276,
"mask/share_step_conf": 0.11348183453083038,
"num_tokens": 33167028.0,
"reward": 0.8225182890892029,
"reward_std": 0.19686385989189148,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7759605646133423,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8690760731697083,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.5633289813995361,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7426817417144775,
"adv/std_final_conf": 0.8024340271949768,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339341521263123,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8573802740469407,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.12928853754940706,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5849802371541502,
"calib/gap": 0.6341113824447158,
"calib/mean_conf": 0.6238339920948617,
"calib/mu_c": 0.8519135802469135,
"calib/mu_w": 0.2178021978021978,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.056403162055335895,
"calib/std_conf": 0.45360732887048,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.47482968369829687,
"calib/step_q_c_n": 822.0,
"calib/step_q_gap": 0.212613997423787,
"calib/step_q_w": 0.26221568627450986,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 512.6796875,
"completions/mean_terminated_length": 512.6796875,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1504,
"grad_norm": 0.04484057426452637,
"kl": 0.06557464599609375,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0214,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03456936031579971,
"mask/share_reasoning": 0.8581772446632385,
"mask/share_step_conf": 0.10725339502096176,
"num_tokens": 33405370.0,
"reward": 0.8522858023643494,
"reward_std": 0.16684626042842865,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.8418089747428894,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8627626299858093,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.584743320941925,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7665185332298279,
"adv/std_final_conf": 0.8165276050567627,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341987371444702,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7693518518518518,
"calib/avg_num_step_conf": 5.62109375,
"calib/ece": 0.22447058823529412,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5686274509803921,
"calib/gap": 0.476101851851852,
"calib/mean_conf": 0.6058039215686275,
"calib/mu_c": 0.829851851851852,
"calib/mu_w": 0.35374999999999995,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1504313725490196,
"calib/std_conf": 0.46283698987905836,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5230730337078652,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.21269614237361484,
"calib/step_q_w": 0.31037689133425034,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1626.0,
"completions/max_terminated_length": 1626.0,
"completions/mean_length": 508.7734375,
"completions/mean_terminated_length": 510.7686462402344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.044705916196107864,
"kl": 0.06641387939453125,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0199,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03387663513422012,
"mask/share_reasoning": 0.8435350060462952,
"mask/share_step_conf": 0.11868210136890411,
"num_tokens": 33640776.0,
"reward": 0.8184232711791992,
"reward_std": 0.16215607523918152,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.765038251876831,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8718082904815674,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.622256875038147,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7556319236755371,
"adv/std_final_conf": 0.8251966834068298,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9331852793693542,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.790186673580503,
"calib/avg_num_step_conf": 5.2109375,
"calib/ece": 0.2248152610441768,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.606425702811245,
"calib/gap": 0.41621947109152185,
"calib/mean_conf": 0.6765421686746989,
"calib/mu_c": 0.8704436090225565,
"calib/mu_w": 0.4542241379310346,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.18361044176706837,
"calib/std_conf": 0.4200678077182785,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4771777251184835,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.16781181926969607,
"calib/step_q_w": 0.3093659058487874,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2719.0,
"completions/max_terminated_length": 2719.0,
"completions/mean_length": 507.43359375,
"completions/mean_terminated_length": 511.42913818359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.042031846940517426,
"kl": 0.0727386474609375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.1356,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03369224816560745,
"mask/share_reasoning": 0.8472211360931396,
"mask/share_step_conf": 0.1112741082906723,
"num_tokens": 33878015.0,
"reward": 0.786507785320282,
"reward_std": 0.19533541798591614,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7297797203063965,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.843235969543457,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.5098259449005127,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7644120454788208,
"adv/std_final_conf": 0.7566507458686829,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9335551261901855,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7450912129229912,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.2165217391304348,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6047430830039525,
"calib/gap": 0.423139534883721,
"calib/mean_conf": 0.6461660079051385,
"calib/mu_c": 0.79,
"calib/mu_w": 0.366860465116279,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10130434782608694,
"calib/std_conf": 0.44501001910676913,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49962261904761907,
"calib/step_q_c_n": 840.0,
"calib/step_q_gap": 0.17035817460317465,
"calib/step_q_w": 0.3292644444444444,
"calib/step_q_w_n": 450.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2623.0,
"completions/max_terminated_length": 2623.0,
"completions/mean_length": 491.984375,
"completions/mean_terminated_length": 491.984375,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.1536,
"grad_norm": 0.03865968808531761,
"kl": 0.07563018798828125,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0603,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036693550646305084,
"mask/share_reasoning": 0.8460485935211182,
"mask/share_step_conf": 0.11725786328315735,
"num_tokens": 34108091.0,
"reward": 0.8114147186279297,
"reward_std": 0.16153846681118011,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7582898139953613,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.864539623260498,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.5739729404449463,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7518049478530884,
"adv/std_final_conf": 0.808935821056366,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343997240066528,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6671747967479674,
"calib/avg_num_step_conf": 5.796875,
"calib/ece": 0.2733267716535432,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7440944881889764,
"calib/gap": 0.21942615176151747,
"calib/mean_conf": 0.809232283464567,
"calib/mu_c": 0.886981707317073,
"calib/mu_w": 0.6675555555555556,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21844488188976366,
"calib/std_conf": 0.34307591556340367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5095975473801562,
"calib/step_q_c_n": 897.0,
"calib/step_q_gap": 0.14406943835119534,
"calib/step_q_w": 0.3655281090289608,
"calib/step_q_w_n": 587.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2726.0,
"completions/max_terminated_length": 2726.0,
"completions/mean_length": 478.62890625,
"completions/mean_terminated_length": 478.62890625,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.06480003148317337,
"kl": 0.1052093505859375,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0093,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037643514573574066,
"mask/share_reasoning": 0.8297535181045532,
"mask/share_step_conf": 0.1326029896736145,
"num_tokens": 34333324.0,
"reward": 0.7896007299423218,
"reward_std": 0.17306607961654663,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.721485435962677,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.857715904712677,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.6691650152206421,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7208544015884399,
"adv/std_final_conf": 0.8580728769302368,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354173541069031,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7247138047138046,
"calib/avg_num_step_conf": 5.6328125,
"calib/ece": 0.2909387755102042,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6775510204081633,
"calib/gap": 0.4084545454545453,
"calib/mean_conf": 0.7133877551020407,
"calib/mu_c": 0.9384545454545453,
"calib/mu_w": 0.53,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2776734693877552,
"calib/std_conf": 0.4278884470459194,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5301604278074865,
"calib/step_q_c_n": 561.0,
"calib/step_q_gap": 0.18630798739886,
"calib/step_q_w": 0.3438524404086265,
"calib/step_q_w_n": 881.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 552.32421875,
"completions/mean_terminated_length": 554.490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.03820987418293953,
"kl": 0.0627593994140625,
"learning_rate": 1.5e-06,
"loss": -0.0077,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03305232897400856,
"mask/share_reasoning": 0.851200520992279,
"mask/share_step_conf": 0.11184092611074448,
"num_tokens": 34581935.0,
"reward": 0.7427989840507507,
"reward_std": 0.2363566756248474,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.667646050453186,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8179517984390259,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6376939415931702,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7715792655944824,
"adv/std_final_conf": 0.8261988162994385,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342957139015198,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.67941405237231,
"calib/avg_num_step_conf": 5.41015625,
"calib/ece": 0.3275100401606425,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6947791164658634,
"calib/gap": 0.3267571947109149,
"calib/mean_conf": 0.7224497991967871,
"calib/mu_c": 0.8969827586206895,
"calib/mu_w": 0.5702255639097746,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.29204819277108424,
"calib/std_conf": 0.42023901031809546,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5681996726677578,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.18967254088481206,
"calib/step_q_w": 0.3785271317829458,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 506.42578125,
"completions/mean_terminated_length": 510.41339111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.1568,
"grad_norm": 0.06945173442363739,
"kl": 0.0741424560546875,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0975,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.033999308943748474,
"mask/share_reasoning": 0.8442288637161255,
"mask/share_step_conf": 0.11395932734012604,
"num_tokens": 34815260.0,
"reward": 0.7255409955978394,
"reward_std": 0.21569323539733887,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6502288579940796,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8008532524108887,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.557876467704773,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7473621368408203,
"adv/std_final_conf": 0.8042396306991577,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342260360717773,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7970934256055363,
"calib/avg_num_step_conf": 5.23828125,
"calib/ece": 0.18360784313725492,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6627450980392157,
"calib/gap": 0.4832352941176471,
"calib/mean_conf": 0.7098039215686275,
"calib/mu_c": 0.8708823529411766,
"calib/mu_w": 0.38764705882352946,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11337254901960786,
"calib/std_conf": 0.4221173993193727,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5242913385826772,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": 0.15986655982161524,
"calib/step_q_w": 0.36442477876106194,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 481.53125,
"completions/mean_terminated_length": 481.53125,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.03422103449702263,
"kl": 0.07001495361328125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.022,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03782453387975693,
"mask/share_reasoning": 0.8377889394760132,
"mask/share_step_conf": 0.1243865117430687,
"num_tokens": 35043644.0,
"reward": 0.829301118850708,
"reward_std": 0.18109026551246643,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.8058764934539795,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8527256846427917,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6228979825973511,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7397005558013916,
"adv/std_final_conf": 0.8394556045532227,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934718668460846,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7786641689813318,
"calib/avg_num_step_conf": 5.68359375,
"calib/ece": 0.22928000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.704,
"calib/gap": 0.4324539758413537,
"calib/mean_conf": 0.7425600000000002,
"calib/mu_c": 0.9380291970802919,
"calib/mu_w": 0.5055752212389382,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21192000000000003,
"calib/std_conf": 0.4013694637114288,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5624447949526814,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": 0.20254710920359492,
"calib/step_q_w": 0.3598976857490865,
"calib/step_q_w_n": 821.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 535.34765625,
"completions/mean_terminated_length": 537.4470825195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.08719879388809204,
"kl": 0.06774139404296875,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0734,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0348748117685318,
"mask/share_reasoning": 0.8489192724227905,
"mask/share_step_conf": 0.11229971051216125,
"num_tokens": 35285149.0,
"reward": 0.7994047999382019,
"reward_std": 0.21757547557353973,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7456910610198975,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8531185388565063,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.6043723821640015,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.765477180480957,
"adv/std_final_conf": 0.8277738094329834,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337379336357117,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6841843501326259,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.26562248995983945,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7309236947791165,
"calib/gap": 0.31072015915119355,
"calib/mean_conf": 0.7849799196787149,
"calib/mu_c": 0.9147586206896551,
"calib/mu_w": 0.6040384615384615,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23413654618473906,
"calib/std_conf": 0.37364494077326327,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5627369207772795,
"calib/step_q_c_n": 669.0,
"calib/step_q_gap": 0.1477709343827217,
"calib/step_q_w": 0.41496598639455784,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2276.0,
"completions/max_terminated_length": 2276.0,
"completions/mean_length": 446.51171875,
"completions/mean_terminated_length": 448.2627868652344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.16,
"grad_norm": 0.054330550134181976,
"kl": 0.07338714599609375,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0385,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04155741631984711,
"mask/share_reasoning": 0.8253986835479736,
"mask/share_step_conf": 0.12913760542869568,
"num_tokens": 35504416.0,
"reward": 0.772121012210846,
"reward_std": 0.19345763325691223,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7073625326156616,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8368796110153198,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6646082401275635,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7533245086669922,
"adv/std_final_conf": 0.8691293597221375,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349757432937622,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7634751773049645,
"calib/avg_num_step_conf": 5.0625,
"calib/ece": 0.24605577689243022,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5617529880478087,
"calib/gap": 0.4519587362991618,
"calib/mean_conf": 0.6250199203187251,
"calib/mu_c": 0.8789090909090909,
"calib/mu_w": 0.4269503546099291,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.216414342629482,
"calib/std_conf": 0.4448226957776302,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4954858757062146,
"calib/step_q_c_n": 531.0,
"calib/step_q_gap": 0.1185368560983715,
"calib/step_q_w": 0.3769490196078431,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2085.0,
"completions/max_terminated_length": 2085.0,
"completions/mean_length": 494.80859375,
"completions/mean_terminated_length": 496.7490539550781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.05440312251448631,
"kl": 0.0706787109375,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.041,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033775001764297485,
"mask/share_reasoning": 0.8540380597114563,
"mask/share_step_conf": 0.10828067362308502,
"num_tokens": 35738111.0,
"reward": 0.773845911026001,
"reward_std": 0.2120620310306549,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.72899329662323,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.818698525428772,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7054992914199829,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7607833743095398,
"adv/std_final_conf": 0.9004095196723938,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348529577255249,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7035671519505893,
"calib/avg_num_step_conf": 5.15625,
"calib/ece": 0.3051190476190476,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5277777777777778,
"calib/gap": 0.301888825865003,
"calib/mean_conf": 0.6210714285714285,
"calib/mu_c": 0.7756097560975611,
"calib/mu_w": 0.4737209302325581,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.219047619047619,
"calib/std_conf": 0.4392129085363309,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4830955414012739,
"calib/step_q_c_n": 628.0,
"calib/step_q_gap": 0.12309120614115832,
"calib/step_q_w": 0.3600043352601156,
"calib/step_q_w_n": 692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 488.7265625,
"completions/mean_terminated_length": 488.7265625,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.04921410232782364,
"kl": 0.074859619140625,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.073,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03677456080913544,
"mask/share_reasoning": 0.8412362933158875,
"mask/share_step_conf": 0.1219891905784607,
"num_tokens": 35968617.0,
"reward": 0.7600628137588501,
"reward_std": 0.1953212320804596,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.679622232913971,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.840503454208374,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7010814547538757,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7753474712371826,
"adv/std_final_conf": 0.9040634632110596,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346261620521545,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6131807927811795,
"calib/avg_num_step_conf": 4.796875,
"calib/ece": 0.3587301587301588,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.44047619047619047,
"calib/gap": 0.17447631324524665,
"calib/mean_conf": 0.5193650793650795,
"calib/mu_c": 0.593448275862069,
"calib/mu_w": 0.4189719626168224,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15134920634920643,
"calib/std_conf": 0.455563049548449,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.468414442700157,
"calib/step_q_c_n": 637.0,
"calib/step_q_gap": 0.13057180310624833,
"calib/step_q_w": 0.33784263959390864,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2253.0,
"completions/max_terminated_length": 2253.0,
"completions/mean_length": 466.2109375,
"completions/mean_terminated_length": 468.03924560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1632,
"grad_norm": 0.04134545847773552,
"kl": 0.0821990966796875,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0493,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035583529621362686,
"mask/share_reasoning": 0.8485206961631775,
"mask/share_step_conf": 0.11198952794075012,
"num_tokens": 36195287.0,
"reward": 0.7215287089347839,
"reward_std": 0.19878242909908295,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6165081858634949,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8265491724014282,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6727048754692078,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7404109239578247,
"adv/std_final_conf": 0.8876039981842041,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341225028038025,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7406015037593985,
"calib/avg_num_step_conf": 4.84765625,
"calib/ece": 0.24614173228346453,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.452755905511811,
"calib/gap": 0.4022581246504691,
"calib/mean_conf": 0.5460629921259843,
"calib/mu_c": 0.7566942148760331,
"calib/mu_w": 0.35443609022556394,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15791338582677159,
"calib/std_conf": 0.4529286284710704,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4864110535405871,
"calib/step_q_c_n": 579.0,
"calib/step_q_gap": 0.14947147650131215,
"calib/step_q_w": 0.33693957703927496,
"calib/step_q_w_n": 662.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2264.0,
"completions/max_terminated_length": 2264.0,
"completions/mean_length": 452.73046875,
"completions/mean_terminated_length": 452.73046875,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.04027803987264633,
"kl": 0.08528900146484375,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0591,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03721775487065315,
"mask/share_reasoning": 0.845561146736145,
"mask/share_step_conf": 0.11722112447023392,
"num_tokens": 36415626.0,
"reward": 0.7952397465705872,
"reward_std": 0.1660701334476471,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7354468703269958,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8550325632095337,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.7376800775527954,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7585830688476562,
"adv/std_final_conf": 0.8894400000572205,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341614246368408,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7306756503836797,
"calib/avg_num_step_conf": 4.8125,
"calib/ece": 0.26125984251968504,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3464566929133858,
"calib/gap": 0.36348243808097813,
"calib/mean_conf": 0.4148031496062992,
"calib/mu_c": 0.6108547008547007,
"calib/mu_w": 0.24737226277372262,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10771653543307089,
"calib/std_conf": 0.4486152805147634,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4349628252788104,
"calib/step_q_c_n": 538.0,
"calib/step_q_gap": 0.11674956879466053,
"calib/step_q_w": 0.3182132564841499,
"calib/step_q_w_n": 694.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 971.0,
"completions/max_terminated_length": 971.0,
"completions/mean_length": 420.0,
"completions/mean_terminated_length": 421.6470947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.059748951345682144,
"kl": 0.0921630859375,
"learning_rate": 1.25e-06,
"loss": -0.0633,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03900737315416336,
"mask/share_reasoning": 0.8342753052711487,
"mask/share_step_conf": 0.12281106412410736,
"num_tokens": 36630362.0,
"reward": 0.7750540971755981,
"reward_std": 0.19510385394096375,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7151566743850708,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8349515199661255,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6774182319641113,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7479729056358337,
"adv/std_final_conf": 0.8583827018737793,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9322972297668457,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6803336092188197,
"calib/avg_num_step_conf": 5.5703125,
"calib/ece": 0.3261904761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.45634920634920634,
"calib/gap": 0.29275864264340745,
"calib/mean_conf": 0.5063492063492064,
"calib/mu_c": 0.6376258992805756,
"calib/mu_w": 0.3448672566371681,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14047619047619042,
"calib/std_conf": 0.4690070618729229,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3990714477211797,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.08732880066235615,
"calib/step_q_w": 0.31174264705882354,
"calib/step_q_w_n": 680.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 501.6015625,
"completions/mean_terminated_length": 501.6015625,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.1664,
"grad_norm": 0.0492730550467968,
"kl": 0.08318328857421875,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0033,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03676208108663559,
"mask/share_reasoning": 0.8379840850830078,
"mask/share_step_conf": 0.1252538412809372,
"num_tokens": 36863532.0,
"reward": 0.7542530298233032,
"reward_std": 0.18153919279575348,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6649140119552612,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8435919284820557,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.5950201153755188,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7625734210014343,
"adv/std_final_conf": 0.8319129943847656,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340895414352417,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8078882868937047,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.20484251968503933,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4645669291338583,
"calib/gap": 0.5139125386996902,
"calib/mean_conf": 0.532244094488189,
"calib/mu_c": 0.7386184210526314,
"calib/mu_w": 0.22470588235294117,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06933070866141736,
"calib/std_conf": 0.46117711794783167,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4866437571592211,
"calib/step_q_c_n": 873.0,
"calib/step_q_gap": 0.15837281241768814,
"calib/step_q_w": 0.32827094474153296,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1156.0,
"completions/max_terminated_length": 1156.0,
"completions/mean_length": 460.09765625,
"completions/mean_terminated_length": 461.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.03275960311293602,
"kl": 0.08441925048828125,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0446,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03766844421625137,
"mask/share_reasoning": 0.8255374431610107,
"mask/share_step_conf": 0.1328878104686737,
"num_tokens": 37085045.0,
"reward": 0.8159181475639343,
"reward_std": 0.1707736998796463,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7823695540428162,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8494666814804077,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.6904779672622681,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.754570484161377,
"adv/std_final_conf": 0.8861827254295349,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348433017730713,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6587600139324278,
"calib/avg_num_step_conf": 5.50390625,
"calib/ece": 0.2991269841269841,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.42063492063492064,
"calib/gap": 0.2818557993730407,
"calib/mean_conf": 0.5152380952380952,
"calib/mu_c": 0.6125454545454545,
"calib/mu_w": 0.3306896551724138,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07980158730158732,
"calib/std_conf": 0.44961220377882105,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4330023923444976,
"calib/step_q_c_n": 836.0,
"calib/step_q_gap": 0.10898843073891301,
"calib/step_q_w": 0.3240139616055846,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 481.2265625,
"completions/mean_terminated_length": 481.2265625,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.06318576633930206,
"kl": 0.0872955322265625,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0824,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0401587039232254,
"mask/share_reasoning": 0.8288679122924805,
"mask/share_step_conf": 0.13097335398197174,
"num_tokens": 37313479.0,
"reward": 0.7451825141906738,
"reward_std": 0.2053423523902893,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6652324199676514,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8251326084136963,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.630803108215332,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7304748296737671,
"adv/std_final_conf": 0.8433563113212585,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339118599891663,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6708116883116884,
"calib/avg_num_step_conf": 4.97265625,
"calib/ece": 0.33316,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.444,
"calib/gap": 0.26135064935064933,
"calib/mean_conf": 0.5157200000000001,
"calib/mu_c": 0.6307142857142857,
"calib/mu_w": 0.36936363636363634,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14444000000000004,
"calib/std_conf": 0.4637211248153355,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47020600858369094,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.13975304691121704,
"calib/step_q_w": 0.3304529616724739,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2884.0,
"completions/max_terminated_length": 2884.0,
"completions/mean_length": 454.875,
"completions/mean_terminated_length": 458.4566955566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1696,
"grad_norm": 0.06016838923096657,
"kl": 0.08831024169921875,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0159,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.038090698421001434,
"mask/share_reasoning": 0.8347824215888977,
"mask/share_step_conf": 0.11931438744068146,
"num_tokens": 37534711.0,
"reward": 0.7516264915466309,
"reward_std": 0.19282618165016174,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6458941698074341,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8573589324951172,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.6906576156616211,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7718011736869812,
"adv/std_final_conf": 0.8629789352416992,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352890849113464,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7526358475263585,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.26812244897959187,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3795918367346939,
"calib/gap": 0.40404028115706936,
"calib/mean_conf": 0.44167346938775515,
"calib/mu_c": 0.6197810218978101,
"calib/mu_w": 0.21574074074074076,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.07530612244897958,
"calib/std_conf": 0.4569946807958693,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44134020618556696,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.1309851765997681,
"calib/step_q_w": 0.31035502958579886,
"calib/step_q_w_n": 676.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 513.6171875,
"completions/mean_terminated_length": 515.6314086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.03252209350466728,
"kl": 0.085205078125,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0741,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.034298818558454514,
"mask/share_reasoning": 0.8432285785675049,
"mask/share_step_conf": 0.1185663640499115,
"num_tokens": 37771037.0,
"reward": 0.7507309913635254,
"reward_std": 0.23102092742919922,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6908586025238037,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8106033802032471,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5563451051712036,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7617762088775635,
"adv/std_final_conf": 0.7966976165771484,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341981410980225,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7169324577861163,
"calib/avg_num_step_conf": 5.26953125,
"calib/ece": 0.2749411764705882,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.42745098039215684,
"calib/gap": 0.3759762798177432,
"calib/mean_conf": 0.49125490196078425,
"calib/mu_c": 0.6254268292682926,
"calib/mu_w": 0.24945054945054945,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.06152941176470587,
"calib/std_conf": 0.46211703140142424,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.43188876013904987,
"calib/step_q_c_n": 863.0,
"calib/step_q_gap": 0.11476941857526385,
"calib/step_q_w": 0.317119341563786,
"calib/step_q_w_n": 486.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2411.0,
"completions/max_terminated_length": 2411.0,
"completions/mean_length": 442.19140625,
"completions/mean_terminated_length": 442.19140625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.05513952672481537,
"kl": 0.09521484375,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0572,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03878117725253105,
"mask/share_reasoning": 0.8327029943466187,
"mask/share_step_conf": 0.12851576507091522,
"num_tokens": 37988158.0,
"reward": 0.7575365900993347,
"reward_std": 0.14906707406044006,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6898187398910522,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8252544403076172,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.6162930727005005,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7637404203414917,
"adv/std_final_conf": 0.8308175802230835,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9338586926460266,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7578620464203989,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.2686328125000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.51171875,
"calib/gap": 0.3779320039228506,
"calib/mean_conf": 0.5837890625000001,
"calib/mu_c": 0.7240372670807453,
"calib/mu_w": 0.34610526315789475,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11175781250000004,
"calib/std_conf": 0.45333410954876435,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4803115577889447,
"calib/step_q_c_n": 796.0,
"calib/step_q_gap": 0.12179609387141888,
"calib/step_q_w": 0.3585154639175258,
"calib/step_q_w_n": 485.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1207.0,
"completions/max_terminated_length": 1207.0,
"completions/mean_length": 419.67578125,
"completions/mean_terminated_length": 421.32159423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1728,
"grad_norm": 0.06877302378416061,
"kl": 0.0907135009765625,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0355,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03779768943786621,
"mask/share_reasoning": 0.8321254253387451,
"mask/share_step_conf": 0.12617066502571106,
"num_tokens": 38199739.0,
"reward": 0.7955017685890198,
"reward_std": 0.1514347791671753,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7354754209518433,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8555281162261963,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.6081020832061768,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7466784119606018,
"adv/std_final_conf": 0.8114362359046936,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9338409900665283,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7797653768876791,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.19672000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.444,
"calib/gap": 0.4820027221466071,
"calib/mean_conf": 0.51448,
"calib/mu_c": 0.7284892086330936,
"calib/mu_w": 0.24648648648648652,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07760000000000006,
"calib/std_conf": 0.4561420059586707,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5087028657616892,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.1811028657616892,
"calib/step_q_w": 0.3276,
"calib/step_q_w_n": 650.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 492.984375,
"completions/mean_terminated_length": 496.86614990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.059631235897541046,
"kl": 0.0717620849609375,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0608,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.036981552839279175,
"mask/share_reasoning": 0.8366219401359558,
"mask/share_step_conf": 0.11858400702476501,
"num_tokens": 38430775.0,
"reward": 0.8108958601951599,
"reward_std": 0.185151606798172,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7628577947616577,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8589339256286621,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6528012156486511,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7602795362472534,
"adv/std_final_conf": 0.8653119206428528,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9333934783935547,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7601586687306501,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.21524000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.484,
"calib/gap": 0.42793343653250765,
"calib/mean_conf": 0.58148,
"calib/mu_c": 0.7766176470588235,
"calib/mu_w": 0.3486842105263159,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12636000000000003,
"calib/std_conf": 0.4410839031295519,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4856828885400314,
"calib/step_q_c_n": 637.0,
"calib/step_q_gap": 0.1675082386800874,
"calib/step_q_w": 0.318174649859944,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 508.0390625,
"completions/mean_terminated_length": 510.0314025878906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.0942251980304718,
"kl": 0.1238250732421875,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.034159790724515915,
"mask/share_reasoning": 0.8542314171791077,
"mask/share_step_conf": 0.10770251601934433,
"num_tokens": 38666969.0,
"reward": 0.8032075762748718,
"reward_std": 0.16987591981887817,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7502793073654175,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.856135904788971,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5365115404129028,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.738160252571106,
"adv/std_final_conf": 0.7709892988204956,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342562556266785,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7521900800403354,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.24726190476190468,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.49206349206349204,
"calib/gap": 0.4190754395916051,
"calib/mean_conf": 0.5639285714285714,
"calib/mu_c": 0.7784552845528454,
"calib/mu_w": 0.3593798449612403,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.161547619047619,
"calib/std_conf": 0.45445855938638285,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.46590625,
"calib/step_q_c_n": 640.0,
"calib/step_q_gap": 0.12072669198895025,
"calib/step_q_w": 0.34517955801104977,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2536.0,
"completions/max_terminated_length": 2536.0,
"completions/mean_length": 512.60546875,
"completions/mean_terminated_length": 512.60546875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.176,
"grad_norm": 0.03339356184005737,
"kl": 0.076690673828125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0881,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03438538312911987,
"mask/share_reasoning": 0.8467801809310913,
"mask/share_step_conf": 0.11883437633514404,
"num_tokens": 38903772.0,
"reward": 0.7943029999732971,
"reward_std": 0.1667207032442093,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7338140606880188,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8547918796539307,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.5652823448181152,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7558987140655518,
"adv/std_final_conf": 0.8116322159767151,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9338290095329285,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8307422969187674,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.1436614173228345,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5787401574803149,
"calib/gap": 0.5618151260504202,
"calib/mean_conf": 0.646732283464567,
"calib/mu_c": 0.8325294117647059,
"calib/mu_w": 0.2707142857142857,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.06055118110236205,
"calib/std_conf": 0.4369948728292332,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4977944382647386,
"calib/step_q_c_n": 899.0,
"calib/step_q_gap": 0.1694156231839487,
"calib/step_q_w": 0.3283788150807899,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1569.0,
"completions/max_terminated_length": 1569.0,
"completions/mean_length": 491.43359375,
"completions/mean_terminated_length": 493.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.03352775797247887,
"kl": 0.07314300537109375,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0695,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03505018725991249,
"mask/share_reasoning": 0.8370988368988037,
"mask/share_step_conf": 0.1239447072148323,
"num_tokens": 39135763.0,
"reward": 0.8427225351333618,
"reward_std": 0.1596928834915161,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.8177686929702759,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8676763772964478,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.5534255504608154,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.745092511177063,
"adv/std_final_conf": 0.795141875743866,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934163510799408,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6478658536585366,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.23289682539682524,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7023809523809523,
"calib/gap": 0.2849002217294899,
"calib/mean_conf": 0.7765476190476192,
"calib/mu_c": 0.8760365853658536,
"calib/mu_w": 0.5911363636363637,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1793253968253967,
"calib/std_conf": 0.3697108288091863,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5175310734463276,
"calib/step_q_c_n": 885.0,
"calib/step_q_gap": 0.022448331129542753,
"calib/step_q_w": 0.4950827423167849,
"calib/step_q_w_n": 423.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1950.0,
"completions/max_terminated_length": 1950.0,
"completions/mean_length": 451.62109375,
"completions/mean_terminated_length": 451.62109375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.030953530222177505,
"kl": 0.078460693359375,
"learning_rate": 9.166666666666666e-07,
"loss": -0.022,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037154220044612885,
"mask/share_reasoning": 0.8443901538848877,
"mask/share_step_conf": 0.11845562607049942,
"num_tokens": 39356986.0,
"reward": 0.7836979627609253,
"reward_std": 0.15814566612243652,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.734974205493927,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8324216604232788,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.6224509477615356,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7292006611824036,
"adv/std_final_conf": 0.8391119837760925,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339708089828491,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7255116959064327,
"calib/avg_num_step_conf": 5.51953125,
"calib/ece": 0.23704860557768925,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6533864541832669,
"calib/gap": 0.34066127060074425,
"calib/mean_conf": 0.7330645418326693,
"calib/mu_c": 0.867428947368421,
"calib/mu_w": 0.5267676767676768,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18226772908366537,
"calib/std_conf": 0.39824627205395124,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5217144754316069,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": 0.20040992997706147,
"calib/step_q_w": 0.3213045454545454,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3069.0,
"completions/max_terminated_length": 3069.0,
"completions/mean_length": 533.8125,
"completions/mean_terminated_length": 533.8125,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1792,
"grad_norm": 0.06673835963010788,
"kl": 0.09064483642578125,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0176,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03487485274672508,
"mask/share_reasoning": 0.8501797914505005,
"mask/share_step_conf": 0.11494536697864532,
"num_tokens": 39598314.0,
"reward": 0.79753577709198,
"reward_std": 0.1876133680343628,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7344002723693848,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8606711626052856,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.5384137630462646,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7396717071533203,
"adv/std_final_conf": 0.7766261696815491,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9322170615196228,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7509878819810327,
"calib/avg_num_step_conf": 4.6328125,
"calib/ece": 0.2440399999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.656,
"calib/gap": 0.3588935721812435,
"calib/mean_conf": 0.7184400000000001,
"calib/mu_c": 0.8677397260273974,
"calib/mu_w": 0.5088461538461538,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18923999999999988,
"calib/std_conf": 0.4037984229785946,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6070416024653313,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.20452763598488438,
"calib/step_q_w": 0.40251396648044696,
"calib/step_q_w_n": 537.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1671.0,
"completions/max_terminated_length": 1671.0,
"completions/mean_length": 472.47265625,
"completions/mean_terminated_length": 476.1929016113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.055275809019804,
"kl": 0.080841064453125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.1261,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035248324275016785,
"mask/share_reasoning": 0.8490923643112183,
"mask/share_step_conf": 0.10784684866666794,
"num_tokens": 39823451.0,
"reward": 0.7923622131347656,
"reward_std": 0.18291811645030975,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7293832302093506,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8553411960601807,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.6205732822418213,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7723401784896851,
"adv/std_final_conf": 0.844386637210846,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344258308410645,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7086206896551723,
"calib/avg_num_step_conf": 5.56640625,
"calib/ece": 0.272730923694779,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7068273092369478,
"calib/gap": 0.31264389920424396,
"calib/mean_conf": 0.761004016064257,
"calib/mu_c": 0.8915862068965518,
"calib/mu_w": 0.5789423076923078,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.22570281124497985,
"calib/std_conf": 0.39005534050769003,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5763198959687906,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": 0.20108818865171746,
"calib/step_q_w": 0.37523170731707317,
"calib/step_q_w_n": 656.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 508.46875,
"completions/mean_terminated_length": 510.4627685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.042287420481443405,
"kl": 0.0757293701171875,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0018,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03357735648751259,
"mask/share_reasoning": 0.8427532911300659,
"mask/share_step_conf": 0.11976308375597,
"num_tokens": 40057771.0,
"reward": 0.770465612411499,
"reward_std": 0.2155264914035797,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6976511478424072,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8432800769805908,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.665807843208313,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7515328526496887,
"adv/std_final_conf": 0.8829014301300049,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342430233955383,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.711092509920635,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.3056299212598425,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.594488188976378,
"calib/gap": 0.29211929563492056,
"calib/mean_conf": 0.6565748031496065,
"calib/mu_c": 0.801484375,
"calib/mu_w": 0.5093650793650795,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22913385826771654,
"calib/std_conf": 0.42909224857868916,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.54494227994228,
"calib/step_q_c_n": 693.0,
"calib/step_q_gap": 0.14006921492680008,
"calib/step_q_w": 0.40487306501547987,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2779.0,
"completions/max_terminated_length": 2779.0,
"completions/mean_length": 477.64453125,
"completions/mean_terminated_length": 477.64453125,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1824,
"grad_norm": 0.0317617803812027,
"kl": 0.06999969482421875,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0782,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03570724278688431,
"mask/share_reasoning": 0.8483747243881226,
"mask/share_step_conf": 0.11591806262731552,
"num_tokens": 40286944.0,
"reward": 0.7655474543571472,
"reward_std": 0.19288235902786255,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6793617010116577,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8517332077026367,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.6147419214248657,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7623175382614136,
"adv/std_final_conf": 0.8438313603401184,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339761137962341,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7227416570771,
"calib/avg_num_step_conf": 5.1171875,
"calib/ece": 0.23525609756097562,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7804878048780488,
"calib/gap": 0.27829789988492515,
"calib/mean_conf": 0.8279146341463414,
"calib/mu_c": 0.9274683544303797,
"calib/mu_w": 0.6491704545454545,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21044715447154475,
"calib/std_conf": 0.3294144793671585,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5150586701434159,
"calib/step_q_c_n": 767.0,
"calib/step_q_gap": 0.10485240863328693,
"calib/step_q_w": 0.41020626151012896,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2678.0,
"completions/max_terminated_length": 2678.0,
"completions/mean_length": 486.46875,
"completions/mean_terminated_length": 486.46875,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.0316920168697834,
"kl": 0.07049560546875,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0391,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03700914978981018,
"mask/share_reasoning": 0.8399681448936462,
"mask/share_step_conf": 0.12302268296480179,
"num_tokens": 40514832.0,
"reward": 0.7855240702629089,
"reward_std": 0.20845410227775574,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7256511449813843,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8453969955444336,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.5969452261924744,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7282835841178894,
"adv/std_final_conf": 0.8132307529449463,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352379441261292,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6944219066937118,
"calib/avg_num_step_conf": 5.34765625,
"calib/ece": 0.3042914979757085,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.805668016194332,
"calib/gap": 0.2197221095334685,
"calib/mean_conf": 0.854574898785425,
"calib/mu_c": 0.9453103448275861,
"calib/mu_w": 0.7255882352941176,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.28591093117408906,
"calib/std_conf": 0.3066399487270928,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5201515892420537,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.01553452935094668,
"calib/step_q_w": 0.5046170598911071,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2065.0,
"completions/max_terminated_length": 2065.0,
"completions/mean_length": 489.921875,
"completions/mean_terminated_length": 495.7312316894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.03221871331334114,
"kl": 0.07108306884765625,
"learning_rate": 7.5e-07,
"loss": -0.0237,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.036594174802303314,
"mask/share_reasoning": 0.8292064070701599,
"mask/share_step_conf": 0.12248068302869797,
"num_tokens": 40743412.0,
"reward": 0.7309811115264893,
"reward_std": 0.20213276147842407,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.670396089553833,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7915661334991455,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.6975492238998413,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7517229318618774,
"adv/std_final_conf": 0.8743834495544434,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348888993263245,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6683354591836735,
"calib/avg_num_step_conf": 5.31640625,
"calib/ece": 0.37138888888888894,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5873015873015873,
"calib/gap": 0.21378571428571425,
"calib/mean_conf": 0.7062301587301588,
"calib/mu_c": 0.8250000000000001,
"calib/mu_w": 0.6112142857142858,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.31658730158730164,
"calib/std_conf": 0.4008845311862497,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5150877192982457,
"calib/step_q_c_n": 570.0,
"calib/step_q_gap": 0.08085257391265782,
"calib/step_q_w": 0.4342351453855879,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2395.0,
"completions/max_terminated_length": 2395.0,
"completions/mean_length": 517.17578125,
"completions/mean_terminated_length": 517.17578125,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.1856,
"grad_norm": 0.0422983393073082,
"kl": 0.06575775146484375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0301,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033281851559877396,
"mask/share_reasoning": 0.8519580364227295,
"mask/share_step_conf": 0.1147601306438446,
"num_tokens": 40980041.0,
"reward": 0.7034145593643188,
"reward_std": 0.20858539640903473,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6095452904701233,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.797283947467804,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.625144362449646,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7550072073936462,
"adv/std_final_conf": 0.8507199883460999,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350666999816895,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7613138686131387,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.24036437246963566,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5182186234817814,
"calib/gap": 0.41225414731254134,
"calib/mean_conf": 0.6087044534412956,
"calib/mu_c": 0.8373636363636363,
"calib/mu_w": 0.42510948905109497,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2018623481781377,
"calib/std_conf": 0.43674442180772827,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5487040280210157,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.1899890692459863,
"calib/step_q_w": 0.3587149587750294,
"calib/step_q_w_n": 849.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2944.0,
"completions/max_terminated_length": 2944.0,
"completions/mean_length": 537.10546875,
"completions/mean_terminated_length": 537.10546875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.02944857068359852,
"kl": 0.0681304931640625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.063,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033590368926525116,
"mask/share_reasoning": 0.8462784290313721,
"mask/share_step_conf": 0.12013113498687744,
"num_tokens": 41223364.0,
"reward": 0.7608479857444763,
"reward_std": 0.20738910138607025,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.7097246050834656,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8119714260101318,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6091908812522888,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7560907006263733,
"adv/std_final_conf": 0.8196773529052734,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.933989405632019,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7352638352638352,
"calib/avg_num_step_conf": 5.2578125,
"calib/ece": 0.23609163346613551,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6175298804780877,
"calib/gap": 0.37361383526383535,
"calib/mean_conf": 0.6888406374501992,
"calib/mu_c": 0.8540642857142858,
"calib/mu_w": 0.4804504504504505,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18358167330677297,
"calib/std_conf": 0.4162838042481054,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.533147102526003,
"calib/step_q_c_n": 673.0,
"calib/step_q_gap": 0.1934888558692422,
"calib/step_q_w": 0.33965824665676075,
"calib/step_q_w_n": 673.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 487.98046875,
"completions/mean_terminated_length": 487.98046875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.02829635515809059,
"kl": 0.07366561889648438,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0086,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03810054063796997,
"mask/share_reasoning": 0.837080180644989,
"mask/share_step_conf": 0.12481928616762161,
"num_tokens": 41452351.0,
"reward": 0.7918046712875366,
"reward_std": 0.19538137316703796,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7325851321220398,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8510242700576782,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.6599198579788208,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7703679800033569,
"adv/std_final_conf": 0.8570880889892578,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340048432350159,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7609843546284225,
"calib/avg_num_step_conf": 5.515625,
"calib/ece": 0.251366935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5725806451612904,
"calib/gap": 0.37065202086049526,
"calib/mean_conf": 0.6704798387096775,
"calib/mu_c": 0.8468384615384615,
"calib/mu_w": 0.4761864406779663,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19882661290322587,
"calib/std_conf": 0.41608720733717763,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5185956112852664,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.1675077559881088,
"calib/step_q_w": 0.35108785529715764,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2388.0,
"completions/max_terminated_length": 2388.0,
"completions/mean_length": 501.27734375,
"completions/mean_terminated_length": 507.22137451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.1888,
"grad_norm": 0.06848868727684021,
"kl": 0.1034698486328125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.1139,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03480444848537445,
"mask/share_reasoning": 0.8327597379684448,
"mask/share_step_conf": 0.12071707844734192,
"num_tokens": 41684510.0,
"reward": 0.76722252368927,
"reward_std": 0.18885210156440735,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7177945375442505,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8166505694389343,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6745896339416504,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7268320322036743,
"adv/std_final_conf": 0.8851636648178101,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9334161877632141,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7961158613375755,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.1793951612903225,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6290322580645161,
"calib/gap": 0.4568800954464108,
"calib/mean_conf": 0.7118145161290322,
"calib/mu_c": 0.908936170212766,
"calib/mu_w": 0.4520560747663552,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16133064516129025,
"calib/std_conf": 0.3976578783029185,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5568945686900958,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.2220762432556566,
"calib/step_q_w": 0.3348183254344392,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 471.46875,
"completions/mean_terminated_length": 473.31768798828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.04752740263938904,
"kl": 0.0688934326171875,
"learning_rate": 6.111111111111112e-07,
"loss": -0.1167,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.037102676928043365,
"mask/share_reasoning": 0.8386009335517883,
"mask/share_step_conf": 0.12039016932249069,
"num_tokens": 41911278.0,
"reward": 0.8107980489730835,
"reward_std": 0.19798102974891663,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7743703126907349,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8472259044647217,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.7388206124305725,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7421302795410156,
"adv/std_final_conf": 0.9340841770172119,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341541528701782,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7627878074306645,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.19992031872509955,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5418326693227091,
"calib/gap": 0.3899116954474098,
"calib/mean_conf": 0.6619123505976097,
"calib/mu_c": 0.8234693877551021,
"calib/mu_w": 0.43355769230769237,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13808764940239038,
"calib/std_conf": 0.4118904143496424,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5138596491228069,
"calib/step_q_c_n": 798.0,
"calib/step_q_gap": 0.147195719976269,
"calib/step_q_w": 0.3666639291465379,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2301.0,
"completions/max_terminated_length": 2301.0,
"completions/mean_length": 473.6875,
"completions/mean_terminated_length": 477.4173278808594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.04047829657793045,
"kl": 0.07904815673828125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0288,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034941166639328,
"mask/share_reasoning": 0.8364151120185852,
"mask/share_step_conf": 0.12083126604557037,
"num_tokens": 42138806.0,
"reward": 0.8064365386962891,
"reward_std": 0.19271725416183472,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7548269033432007,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8580461144447327,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.6840267181396484,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.768051028251648,
"adv/std_final_conf": 0.8719254732131958,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345057606697083,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7068607068607068,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.2423412698412697,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.33203482328482326,
"calib/mean_conf": 0.6820238095238096,
"calib/mu_c": 0.8190540540540541,
"calib/mu_w": 0.4870192307692308,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1685317460317459,
"calib/std_conf": 0.4115373223500814,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.479326431181486,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.06518008971807138,
"calib/step_q_w": 0.4141463414634146,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2346.0,
"completions/max_terminated_length": 2346.0,
"completions/mean_length": 536.5859375,
"completions/mean_terminated_length": 538.6902465820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.192,
"grad_norm": 0.057193268090486526,
"kl": 0.08069610595703125,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0664,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032121311873197556,
"mask/share_reasoning": 0.8479681015014648,
"mask/share_step_conf": 0.11600431054830551,
"num_tokens": 42380028.0,
"reward": 0.7702518701553345,
"reward_std": 0.20088596642017365,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7169581651687622,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.823545515537262,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.7163760662078857,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7323043942451477,
"adv/std_final_conf": 0.9061623215675354,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349632859230042,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7605711683580536,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.24403225806451612,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5604838709677419,
"calib/gap": 0.36672911787665885,
"calib/mean_conf": 0.6743548387096775,
"calib/mu_c": 0.8547619047619047,
"calib/mu_w": 0.4880327868852459,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20516129032258063,
"calib/std_conf": 0.41098375671434817,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5386635944700461,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.14001010962156119,
"calib/step_q_w": 0.3986534848484849,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2974.0,
"completions/max_terminated_length": 2974.0,
"completions/mean_length": 479.37109375,
"completions/mean_terminated_length": 479.37109375,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.03152427449822426,
"kl": 0.075347900390625,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0389,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03595711290836334,
"mask/share_reasoning": 0.841357946395874,
"mask/share_step_conf": 0.12268491089344025,
"num_tokens": 42609011.0,
"reward": 0.772752046585083,
"reward_std": 0.23635204136371613,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7130539417266846,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8324500918388367,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.6520901918411255,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7489023804664612,
"adv/std_final_conf": 0.8481500744819641,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345879554748535,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7276109307359306,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.24531599999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.684,
"calib/gap": 0.270469696969697,
"calib/mean_conf": 0.7782760000000001,
"calib/mu_c": 0.8821363636363637,
"calib/mu_w": 0.6116666666666667,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20379599999999998,
"calib/std_conf": 0.3521563173137747,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5275940959409594,
"calib/step_q_c_n": 813.0,
"calib/step_q_gap": 0.15863234730707965,
"calib/step_q_w": 0.36896174863387976,
"calib/step_q_w_n": 549.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3041.0,
"completions/max_terminated_length": 3041.0,
"completions/mean_length": 505.16796875,
"completions/mean_terminated_length": 507.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.038688868284225464,
"kl": 0.06845474243164062,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0739,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03429903835058212,
"mask/share_reasoning": 0.840556263923645,
"mask/share_step_conf": 0.12123845517635345,
"num_tokens": 42844494.0,
"reward": 0.7823052406311035,
"reward_std": 0.19520391523838043,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7236179709434509,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8409925699234009,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6207889318466187,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7769383788108826,
"adv/std_final_conf": 0.8270253539085388,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348618984222412,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.689572192513369,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.2885772357723577,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5487804878048781,
"calib/gap": 0.2507139037433156,
"calib/mean_conf": 0.6483333333333331,
"calib/mu_c": 0.7604411764705883,
"calib/mu_w": 0.5097272727272727,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1920325203252032,
"calib/std_conf": 0.41847291423197214,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5079073033707865,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.11369397003745313,
"calib/step_q_w": 0.39421333333333336,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 510.3359375,
"completions/mean_terminated_length": 514.3543090820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.1952,
"grad_norm": 0.04733794927597046,
"kl": 0.07053375244140625,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.134,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03429635614156723,
"mask/share_reasoning": 0.8464791774749756,
"mask/share_step_conf": 0.11141195148229599,
"num_tokens": 43081820.0,
"reward": 0.7362129092216492,
"reward_std": 0.19574016332626343,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6615542769432068,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8108716011047363,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.6285792589187622,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7463577389717102,
"adv/std_final_conf": 0.8468300104141235,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343642592430115,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7495745887691434,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.1942000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.672,
"calib/gap": 0.3661500283607485,
"calib/mean_conf": 0.7625200000000001,
"calib/mu_c": 0.8884756097560974,
"calib/mu_w": 0.5223255813953489,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1503600000000001,
"calib/std_conf": 0.37392572738446334,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.506656942823804,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.16948271414866795,
"calib/step_q_w": 0.3371742286751361,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 498.54296875,
"completions/mean_terminated_length": 498.54296875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0312087070196867,
"kl": 0.0698089599609375,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0037,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03531767427921295,
"mask/share_reasoning": 0.8455306887626648,
"mask/share_step_conf": 0.11915168166160583,
"num_tokens": 43314727.0,
"reward": 0.8005227446556091,
"reward_std": 0.18379950523376465,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7651569843292236,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8358885049819946,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.5852677822113037,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7569340467453003,
"adv/std_final_conf": 0.7941950559616089,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345303773880005,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8296296296296296,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.21619433198380572,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6396761133603239,
"calib/gap": 0.4553373015873017,
"calib/mean_conf": 0.7179757085020242,
"calib/mu_c": 0.9244444444444445,
"calib/mu_w": 0.46910714285714283,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1938056680161944,
"calib/std_conf": 0.40363323560332265,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5647240356083085,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.2594624376701642,
"calib/step_q_w": 0.30526159793814434,
"calib/step_q_w_n": 776.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2673.0,
"completions/max_terminated_length": 2673.0,
"completions/mean_length": 484.98046875,
"completions/mean_terminated_length": 494.6414489746094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.04685940220952034,
"kl": 0.06523895263671875,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.1251,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03478875011205673,
"mask/share_reasoning": 0.8340494632720947,
"mask/share_step_conf": 0.11163052171468735,
"num_tokens": 43545802.0,
"reward": 0.7864350080490112,
"reward_std": 0.1933988332748413,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7540351748466492,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8188347220420837,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.58069908618927,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7645832300186157,
"adv/std_final_conf": 0.7975696921348572,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9337515830993652,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.779527048914804,
"calib/avg_num_step_conf": 5.71875,
"calib/ece": 0.21015873015873013,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6111111111111112,
"calib/gap": 0.4250068027210885,
"calib/mean_conf": 0.6784920634920635,
"calib/mu_c": 0.8555782312925171,
"calib/mu_w": 0.4305714285714286,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.15265873015873013,
"calib/std_conf": 0.4219758276766112,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5433672086720868,
"calib/step_q_c_n": 738.0,
"calib/step_q_gap": 0.2078892472395799,
"calib/step_q_w": 0.33547796143250685,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 499.56640625,
"completions/mean_terminated_length": 499.56640625,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.1984,
"grad_norm": 0.05687674134969711,
"kl": 0.067626953125,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0359,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03511364758014679,
"mask/share_reasoning": 0.8423900008201599,
"mask/share_step_conf": 0.12249638140201569,
"num_tokens": 43778731.0,
"reward": 0.8191449046134949,
"reward_std": 0.16841217875480652,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7642945051193237,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8739952445030212,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.6501079797744751,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7731779217720032,
"adv/std_final_conf": 0.8458935618400574,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339480996131897,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6654769630110318,
"calib/avg_num_step_conf": 5.8359375,
"calib/ece": 0.2888353413654618,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6024096385542169,
"calib/gap": 0.23776638546398454,
"calib/mean_conf": 0.7121285140562249,
"calib/mu_c": 0.8219402985074628,
"calib/mu_w": 0.5841739130434782,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23140562248995977,
"calib/std_conf": 0.3897047172112078,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5273561643835617,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.1337697769490067,
"calib/step_q_w": 0.393586387434555,
"calib/step_q_w_n": 764.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2303.0,
"completions/max_terminated_length": 2303.0,
"completions/mean_length": 525.34765625,
"completions/mean_terminated_length": 527.4078979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.033688999712467194,
"kl": 0.06855010986328125,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0526,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03296507149934769,
"mask/share_reasoning": 0.8472602367401123,
"mask/share_step_conf": 0.1158684566617012,
"num_tokens": 44014764.0,
"reward": 0.7461711168289185,
"reward_std": 0.20649182796478271,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6634917855262756,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8288504481315613,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.6807498335838318,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7415391206741333,
"adv/std_final_conf": 0.8884475827217102,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349989891052246,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7320123124278569,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.23581673306772918,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5816733067729084,
"calib/gap": 0.35707836347313066,
"calib/mean_conf": 0.6776494023904384,
"calib/mu_c": 0.8384057971014492,
"calib/mu_w": 0.4813274336283186,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.18183266932270925,
"calib/std_conf": 0.40719482081828656,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4818503401360544,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.06637682488725838,
"calib/step_q_w": 0.415473515248796,
"calib/step_q_w_n": 623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 525.28125,
"completions/mean_terminated_length": 527.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.030032223090529442,
"kl": 0.0639801025390625,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.074,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0333237498998642,
"mask/share_reasoning": 0.8468795418739319,
"mask/share_step_conf": 0.11589042842388153,
"num_tokens": 44253308.0,
"reward": 0.7769525051116943,
"reward_std": 0.22362953424453735,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7210996150970459,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8328053951263428,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6388331651687622,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7636686563491821,
"adv/std_final_conf": 0.8439778685569763,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342179298400879,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7751247090123046,
"calib/avg_num_step_conf": 4.953125,
"calib/ece": 0.2163095238095238,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.44047619047619047,
"calib/gap": 0.4233182573994014,
"calib/mean_conf": 0.5520238095238096,
"calib/mu_c": 0.7149677419354838,
"calib/mu_w": 0.29164948453608247,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07662698412698413,
"calib/std_conf": 0.44296470467954524,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5057278911564627,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.19395866038723186,
"calib/step_q_w": 0.3117692307692308,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2642.0,
"completions/max_terminated_length": 2642.0,
"completions/mean_length": 496.68359375,
"completions/mean_terminated_length": 500.594482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.2016,
"grad_norm": 0.03743334114551544,
"kl": 0.07477569580078125,
"learning_rate": 3.055555555555556e-07,
"loss": -0.004,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03554224595427513,
"mask/share_reasoning": 0.8450006246566772,
"mask/share_step_conf": 0.11164465546607971,
"num_tokens": 44488227.0,
"reward": 0.8015495538711548,
"reward_std": 0.1766367256641388,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7476855516433716,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8554134964942932,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6988445520401001,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7438215613365173,
"adv/std_final_conf": 0.9049623012542725,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344231486320496,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.753372061965812,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.23603174603174604,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5396825396825397,
"calib/gap": 0.35544871794871796,
"calib/mean_conf": 0.6442063492063492,
"calib/mu_c": 0.7796153846153846,
"calib/mu_w": 0.42416666666666664,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13059523809523807,
"calib/std_conf": 0.4238941788705319,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47418057142857145,
"calib/step_q_c_n": 875.0,
"calib/step_q_gap": 0.10364353439153445,
"calib/step_q_w": 0.370537037037037,
"calib/step_q_w_n": 540.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2652.0,
"completions/max_terminated_length": 2652.0,
"completions/mean_length": 537.078125,
"completions/mean_terminated_length": 537.078125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.033774811774492264,
"kl": 0.06618499755859375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0439,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03249313682317734,
"mask/share_reasoning": 0.853238046169281,
"mask/share_step_conf": 0.11426880955696106,
"num_tokens": 44731327.0,
"reward": 0.7934824228286743,
"reward_std": 0.18484032154083252,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7358730435371399,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8510918021202087,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6249486804008484,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7530295848846436,
"adv/std_final_conf": 0.8457292318344116,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347898364067078,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6813389227642277,
"calib/avg_num_step_conf": 5.9375,
"calib/ece": 0.3143027888446215,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6294820717131474,
"calib/gap": 0.2817003302845529,
"calib/mean_conf": 0.7018725099601594,
"calib/mu_c": 0.8455284552845529,
"calib/mu_w": 0.563828125,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.26306772908366527,
"calib/std_conf": 0.41427236609269846,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5229447852760737,
"calib/step_q_c_n": 652.0,
"calib/step_q_gap": 0.12827888665856219,
"calib/step_q_w": 0.3946658986175115,
"calib/step_q_w_n": 868.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2774.0,
"completions/max_terminated_length": 2774.0,
"completions/mean_length": 483.26171875,
"completions/mean_terminated_length": 483.26171875,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.04842465743422508,
"kl": 0.0717315673828125,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.017,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03717897832393646,
"mask/share_reasoning": 0.8303982019424438,
"mask/share_step_conf": 0.1324227899312973,
"num_tokens": 44959210.0,
"reward": 0.7430580258369446,
"reward_std": 0.19263812899589539,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6573207378387451,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.828795313835144,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6029754877090454,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7458151578903198,
"adv/std_final_conf": 0.822044312953949,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339615106582642,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.802116935483871,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.15169960474308294,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5217391304347826,
"calib/gap": 0.4747728494623658,
"calib/mean_conf": 0.6448537549407115,
"calib/mu_c": 0.8193750000000002,
"calib/mu_w": 0.3446021505376344,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08207114624505922,
"calib/std_conf": 0.41213908196282156,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47786852085967124,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.11938029816159845,
"calib/step_q_w": 0.3584882226980728,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 496.9609375,
"completions/mean_terminated_length": 496.9609375,
"completions/min_length": 93.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.2048,
"grad_norm": 0.040831126272678375,
"kl": 0.0685577392578125,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0166,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03833800554275513,
"mask/share_reasoning": 0.8447378873825073,
"mask/share_step_conf": 0.11692406237125397,
"num_tokens": 45191408.0,
"reward": 0.8363451957702637,
"reward_std": 0.1784243881702423,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.800956130027771,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8717343211174011,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7382488250732422,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7463130950927734,
"adv/std_final_conf": 0.904547393321991,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350979328155518,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7253340184994861,
"calib/avg_num_step_conf": 5.17578125,
"calib/ece": 0.23023904382470117,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4940239043824701,
"calib/gap": 0.3053943987667011,
"calib/mean_conf": 0.6239442231075698,
"calib/mu_c": 0.7602158273381296,
"calib/mu_w": 0.45482142857142854,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15019920318725097,
"calib/std_conf": 0.4082905536156902,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.47446372239747636,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": 0.11466632731788157,
"calib/step_q_w": 0.3597973950795948,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2999.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 486.83203125,
"completions/mean_terminated_length": 490.66534423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.02862304076552391,
"kl": 0.0765533447265625,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0848,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03511703014373779,
"mask/share_reasoning": 0.8438476324081421,
"mask/share_step_conf": 0.11322282254695892,
"num_tokens": 45421749.0,
"reward": 0.7781213521957397,
"reward_std": 0.2304728627204895,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7139929533004761,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8422497510910034,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.5932942628860474,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7308098673820496,
"adv/std_final_conf": 0.8161402940750122,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343204498291016,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8292651593011305,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.15593625498007982,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5776892430278885,
"calib/gap": 0.5534686536485098,
"calib/mean_conf": 0.6386454183266933,
"calib/mu_c": 0.885611510791367,
"calib/mu_w": 0.3321428571428572,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12039840637450214,
"calib/std_conf": 0.4369896086825614,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5464127423822714,
"calib/step_q_c_n": 722.0,
"calib/step_q_gap": 0.19464887833933758,
"calib/step_q_w": 0.35176386404293386,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 470.05859375,
"completions/mean_terminated_length": 473.75982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.061810459941625595,
"kl": 0.071075439453125,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.002,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03704763948917389,
"mask/share_reasoning": 0.8432860374450684,
"mask/share_step_conf": 0.11185386776924133,
"num_tokens": 45648028.0,
"reward": 0.8355604410171509,
"reward_std": 0.17185883224010468,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.8120867013931274,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8590341806411743,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6721044182777405,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7346723079681396,
"adv/std_final_conf": 0.8902400135993958,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346536993980408,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7843706293706294,
"calib/avg_num_step_conf": 5.3671875,
"calib/ece": 0.18090534979423875,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.5267489711934157,
"calib/gap": 0.4724454545454546,
"calib/mean_conf": 0.6101234567901234,
"calib/mu_c": 0.8045454545454547,
"calib/mu_w": 0.33210000000000006,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1012757201646091,
"calib/std_conf": 0.4387228766957551,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.523060606060606,
"calib/step_q_c_n": 693.0,
"calib/step_q_gap": 0.20959070885062064,
"calib/step_q_w": 0.31346989720998536,
"calib/step_q_w_n": 681.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 514.64453125,
"completions/mean_terminated_length": 516.6627807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.208,
"grad_norm": 0.04098103940486908,
"kl": 0.07430267333984375,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0417,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03490000218153,
"mask/share_reasoning": 0.8435318470001221,
"mask/share_step_conf": 0.11766190826892853,
"num_tokens": 45885761.0,
"reward": 0.7753250002861023,
"reward_std": 0.21513205766677856,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7455902099609375,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8050597906112671,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.5985764265060425,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7298129796981812,
"adv/std_final_conf": 0.8426317572593689,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353283047676086,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7379987129987129,
"calib/avg_num_step_conf": 4.90625,
"calib/ece": 0.22306772908366523,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6613545816733067,
"calib/gap": 0.3861351351351352,
"calib/mean_conf": 0.7302390438247013,
"calib/mu_c": 0.9010000000000001,
"calib/mu_w": 0.5148648648648649,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19776892430278875,
"calib/std_conf": 0.3945898352430957,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.548481089258699,
"calib/step_q_c_n": 661.0,
"calib/step_q_gap": 0.13627940858643012,
"calib/step_q_w": 0.41220168067226887,
"calib/step_q_w_n": 595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1573.0,
"completions/max_terminated_length": 1573.0,
"completions/mean_length": 402.6015625,
"completions/mean_terminated_length": 407.3755187988281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.04002168029546738,
"kl": 0.08524322509765625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0302,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03996644169092178,
"mask/share_reasoning": 0.824980616569519,
"mask/share_step_conf": 0.12333419919013977,
"num_tokens": 46091371.0,
"reward": 0.7817299365997314,
"reward_std": 0.2106354832649231,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7396625280380249,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8237974047660828,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.675631046295166,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.738574743270874,
"adv/std_final_conf": 0.8583427667617798,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342004060745239,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8050909090909091,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.1962560975609756,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.47560975609756095,
"calib/gap": 0.44174671074380156,
"calib/mean_conf": 0.6146300813008131,
"calib/mu_c": 0.831912,
"calib/mu_w": 0.3901652892561984,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15137804878048777,
"calib/std_conf": 0.42352444635002,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4821104387291981,
"calib/step_q_c_n": 661.0,
"calib/step_q_gap": 0.1184095515302121,
"calib/step_q_w": 0.363700887198986,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3059.0,
"completions/max_terminated_length": 3059.0,
"completions/mean_length": 508.63671875,
"completions/mean_terminated_length": 512.6417236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.038508955389261246,
"kl": 0.07263565063476562,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0688,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03412006050348282,
"mask/share_reasoning": 0.8306388854980469,
"mask/share_step_conf": 0.1274285465478897,
"num_tokens": 46326638.0,
"reward": 0.7938829064369202,
"reward_std": 0.19586849212646484,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7496906518936157,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8380751609802246,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.6216574907302856,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7498562335968018,
"adv/std_final_conf": 0.848364531993866,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340881109237671,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8403380102040816,
"calib/avg_num_step_conf": 5.9921875,
"calib/ece": 0.15583333333333332,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49206349206349204,
"calib/gap": 0.5348571428571429,
"calib/mean_conf": 0.5803571428571429,
"calib/mu_c": 0.8180714285714286,
"calib/mu_w": 0.2832142857142857,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09031746031746035,
"calib/std_conf": 0.4407932417503575,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49728750000000005,
"calib/step_q_c_n": 800.0,
"calib/step_q_gap": 0.15315943460490467,
"calib/step_q_w": 0.3441280653950954,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 471.89453125,
"completions/mean_terminated_length": 473.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.2112,
"grad_norm": 0.04384619742631912,
"kl": 0.08667755126953125,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0126,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03809526562690735,
"mask/share_reasoning": 0.82027268409729,
"mask/share_step_conf": 0.13772578537464142,
"num_tokens": 46552827.0,
"reward": 0.8343632221221924,
"reward_std": 0.15485632419586182,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.809451162815094,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8592753410339355,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6543087959289551,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7627103328704834,
"adv/std_final_conf": 0.8575649261474609,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934026300907135,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7413175213396754,
"calib/avg_num_step_conf": 5.3515625,
"calib/ece": 0.21360714285714302,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5674603174603174,
"calib/gap": 0.35828025021176774,
"calib/mean_conf": 0.6854325396825396,
"calib/mu_c": 0.8318724832214766,
"calib/mu_w": 0.4735922330097088,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1538849206349208,
"calib/std_conf": 0.4021402259437888,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5151649076517151,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.1486289599392968,
"calib/step_q_w": 0.36653594771241826,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2310.0,
"completions/max_terminated_length": 2310.0,
"completions/mean_length": 504.8828125,
"completions/mean_terminated_length": 504.8828125,
"completions/min_length": 111.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.05067715048789978,
"kl": 0.0714569091796875,
"learning_rate": 2.777777777777778e-08,
"loss": -0.027,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03707854449748993,
"mask/share_reasoning": 0.8380366563796997,
"mask/share_step_conf": 0.12488484382629395,
"num_tokens": 46786277.0,
"reward": 0.8067346811294556,
"reward_std": 0.18213021755218506,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7490285038948059,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8644406795501709,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5960859060287476,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7385691404342651,
"adv/std_final_conf": 0.8230341672897339,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934851348400116,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8526765188834153,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.1406,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.576,
"calib/gap": 0.5908275862068966,
"calib/mean_conf": 0.63268,
"calib/mu_c": 0.8808275862068965,
"calib/mu_w": 0.29,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09664,
"calib/std_conf": 0.44494181372399694,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.55555587808418,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.23116214986118355,
"calib/step_q_w": 0.3243937282229965,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 471.0859375,
"completions/mean_terminated_length": 472.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.033435944467782974,
"kl": 0.08514785766601562,
"learning_rate": 0.0,
"loss": -0.0914,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03687979280948639,
"mask/share_reasoning": 0.8445696234703064,
"mask/share_step_conf": 0.11464434862136841,
"num_tokens": 47014923.0,
"reward": 0.8379338383674622,
"reward_std": 0.18549615144729614,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.8198585510253906,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8560090661048889,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 2.6277464703912847,
"train_runtime": 14184.3282,
"train_samples_per_second": 3.61,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 47014923,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}