Files
PureRL-1.5B-v7-s2-l2-kl-w0-b0/trainer_state.json
ModelHub XC 6c22d6153b 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w0-b0
Source: Original Platform
2026-06-01 06:44:20 +08:00

12043 lines
496 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7490277290344238,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5698086470140988,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9343300461769104,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04297444224357605,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.8933746814727783,
"reward_std": 0.19672557711601257,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.7698483467102051,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5081617139634353,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9345317482948303,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.0404808484017849,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.8337589502334595,
"reward_std": 0.1928534209728241,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7833774089813232,
"adv/mean_abs_reasoning": 0.44472596049308777,
"adv/mean_abs_step_conf": 0.7741047143936157,
"adv/ratio_final_to_reasoning": 1.761483427036185,
"adv/ratio_step_to_reasoning": 1.7406330710609537,
"adv/std_final_conf": 0.9287951588630676,
"adv/std_reasoning": 0.7013906240463257,
"adv/std_step_conf": 0.9333337545394897,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4731928514270472,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.2416862745098039,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": -0.0023299546545747507,
"calib/mean_conf": 0.8809019607843137,
"calib/mu_c": 0.8800613496932514,
"calib/mu_w": 0.8823913043478262,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2416862745098039,
"calib/std_conf": 0.04315464889957421,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.798605577689243,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": 0.042623926313096194,
"calib/step_q_w": 0.7559816513761468,
"calib/step_q_w_n": 545.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2204.0,
"completions/max_terminated_length": 2204.0,
"completions/mean_length": 501.80078125,
"completions/mean_terminated_length": 501.80078125,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.05636599287390709,
"learning_rate": 7.5e-07,
"loss": 0.0098,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03296242654323578,
"mask/share_reasoning": 0.855893075466156,
"mask/share_step_conf": 0.11114451289176941,
"num_tokens": 692378.0,
"reward": 0.8860100507736206,
"reward_std": 0.17941723763942719,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7003523707389832,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.746667742729187,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7777718305587769,
"adv/mean_abs_reasoning": 0.4602840542793274,
"adv/mean_abs_step_conf": 0.7568343877792358,
"adv/ratio_final_to_reasoning": 1.689764881767508,
"adv/ratio_step_to_reasoning": 1.6442767911311225,
"adv/std_final_conf": 0.930294394493103,
"adv/std_reasoning": 0.720619261264801,
"adv/std_step_conf": 0.9336671233177185,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.44653713983610893,
"calib/avg_num_step_conf": 4.9296875,
"calib/ece": 0.261897233201581,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.25296442687747034,
"calib/gap": -0.00561260904044425,
"calib/mean_conf": 0.878498023715415,
"calib/mu_c": 0.8763461538461537,
"calib/mu_w": 0.8819587628865979,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.261897233201581,
"calib/std_conf": 0.04157137055533821,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7959510869565216,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.017224851215076775,
"calib/step_q_w": 0.7787262357414448,
"calib/step_q_w_n": 526.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 498.515625,
"completions/mean_terminated_length": 500.4706115722656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.04587812349200249,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0175,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033491168171167374,
"mask/share_reasoning": 0.8485321998596191,
"mask/share_step_conf": 0.1140703409910202,
"num_tokens": 926166.0,
"reward": 0.8646347522735596,
"reward_std": 0.1897309273481369,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6825304627418518,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7272076606750488,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7645877599716187,
"adv/mean_abs_reasoning": 0.4408244490623474,
"adv/mean_abs_step_conf": 0.773873507976532,
"adv/ratio_final_to_reasoning": 1.7344495333639722,
"adv/ratio_step_to_reasoning": 1.755514036534485,
"adv/std_final_conf": 0.9312360286712646,
"adv/std_reasoning": 0.7204880714416504,
"adv/std_step_conf": 0.9340111017227173,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.46828609986504716,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.3475200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.292,
"calib/gap": 0.0034355118565644327,
"calib/mean_conf": 0.87952,
"calib/mu_c": 0.8811278195488723,
"calib/mu_w": 0.8776923076923079,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3475200000000001,
"calib/std_conf": 0.05802559435283711,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7969736842105262,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.03234120557804765,
"calib/step_q_w": 0.7646324786324785,
"calib/step_q_w_n": 585.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1943.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 515.2890625,
"completions/mean_terminated_length": 517.309814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.037411727011203766,
"learning_rate": 1.25e-06,
"loss": -0.0572,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03343050926923752,
"mask/share_reasoning": 0.850313663482666,
"mask/share_step_conf": 0.11234962195158005,
"num_tokens": 1164768.0,
"reward": 0.7938537001609802,
"reward_std": 0.17242825031280518,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6070008277893066,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.6846128106117249,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.767395555973053,
"adv/mean_abs_reasoning": 0.43761974573135376,
"adv/mean_abs_step_conf": 0.7543257474899292,
"adv/ratio_final_to_reasoning": 1.753567025844721,
"adv/ratio_step_to_reasoning": 1.7237013522534128,
"adv/std_final_conf": 0.9321431517601013,
"adv/std_reasoning": 0.7205691337585449,
"adv/std_step_conf": 0.934429407119751,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4851431138493133,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.29980237154150197,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3438735177865613,
"calib/gap": -0.002955333076626765,
"calib/mean_conf": 0.8808300395256917,
"calib/mu_c": 0.879591836734694,
"calib/mu_w": 0.8825471698113208,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29980237154150197,
"calib/std_conf": 0.04231099296039837,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7867022696929239,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": -0.009205321066152128,
"calib/step_q_w": 0.795907590759076,
"calib/step_q_w_n": 606.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2506.0,
"completions/max_terminated_length": 2506.0,
"completions/mean_length": 459.4765625,
"completions/mean_terminated_length": 461.2784729003906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.0064,
"grad_norm": 0.04331325367093086,
"learning_rate": 1.5e-06,
"loss": -0.0298,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03640598803758621,
"mask/share_reasoning": 0.8320022821426392,
"mask/share_step_conf": 0.1276855170726776,
"num_tokens": 1388346.0,
"reward": 0.8372479677200317,
"reward_std": 0.19077688455581665,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6518738269805908,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7109032273292542,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7665672898292542,
"adv/mean_abs_reasoning": 0.511370062828064,
"adv/mean_abs_step_conf": 0.7519409656524658,
"adv/ratio_final_to_reasoning": 1.4990460833586072,
"adv/ratio_step_to_reasoning": 1.4704438533103728,
"adv/std_final_conf": 0.9309672713279724,
"adv/std_reasoning": 0.7575966119766235,
"adv/std_step_conf": 0.9342412352561951,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.45781045751633986,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.28166007905138346,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": -0.00788823529411764,
"calib/mean_conf": 0.8825296442687747,
"calib/mu_c": 0.8794117647058824,
"calib/mu_w": 0.8873000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27972332015810286,
"calib/std_conf": 0.04346553892207595,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7818596059113301,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.009118226600985158,
"calib/step_q_w": 0.7727413793103449,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 518.484375,
"completions/mean_terminated_length": 522.5669555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.04959415644407272,
"learning_rate": 1.75e-06,
"loss": 0.0708,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.031138010323047638,
"mask/share_reasoning": 0.8528178334236145,
"mask/share_step_conf": 0.10823164880275726,
"num_tokens": 1628502.0,
"reward": 0.8639912605285645,
"reward_std": 0.1998172402381897,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6701984405517578,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7405965328216553,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7736358046531677,
"adv/mean_abs_reasoning": 0.48989439010620117,
"adv/mean_abs_step_conf": 0.7445034384727478,
"adv/ratio_final_to_reasoning": 1.5791889441425446,
"adv/ratio_step_to_reasoning": 1.5197223187457842,
"adv/std_final_conf": 0.9313343167304993,
"adv/std_reasoning": 0.7574949264526367,
"adv/std_step_conf": 0.9349920153617859,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.3862290862290862,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.3273306772908367,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.2549800796812749,
"calib/gap": -0.019002574002574102,
"calib/mean_conf": 0.8776892430278885,
"calib/mu_c": 0.8692857142857142,
"calib/mu_w": 0.8882882882882883,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.32362549800796814,
"calib/std_conf": 0.05307514634165364,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7813258785942492,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": -0.005805658648223022,
"calib/step_q_w": 0.7871315372424722,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 531.32421875,
"completions/mean_terminated_length": 533.4078979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.036499813199043274,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0265,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03259691596031189,
"mask/share_reasoning": 0.8604029417037964,
"mask/share_step_conf": 0.10309390723705292,
"num_tokens": 1871033.0,
"reward": 0.8223322033882141,
"reward_std": 0.1828649342060089,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6219609379768372,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7187970876693726,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.8049112558364868,
"adv/mean_abs_reasoning": 0.442771852016449,
"adv/mean_abs_step_conf": 0.7661532163619995,
"adv/ratio_final_to_reasoning": 1.817891657228889,
"adv/ratio_step_to_reasoning": 1.73035664501441,
"adv/std_final_conf": 0.9294659495353699,
"adv/std_reasoning": 0.7015198469161987,
"adv/std_step_conf": 0.9343773126602173,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4814764183185236,
"calib/avg_num_step_conf": 5.03125,
"calib/ece": 0.26160642570281123,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.2931726907630522,
"calib/gap": 0.000470266575529954,
"calib/mean_conf": 0.8800803212851405,
"calib/mu_c": 0.8802597402597403,
"calib/mu_w": 0.8797894736842103,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.26160642570281123,
"calib/std_conf": 0.04436062482015925,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7749525101763906,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.05956231053936689,
"calib/step_q_w": 0.7153901996370237,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2953.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 541.3359375,
"completions/mean_terminated_length": 543.4588623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0096,
"grad_norm": 0.05957135930657387,
"learning_rate": 2.25e-06,
"loss": 0.0745,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03317445516586304,
"mask/share_reasoning": 0.8594812154769897,
"mask/share_step_conf": 0.10343807935714722,
"num_tokens": 2117151.0,
"reward": 0.8370152115821838,
"reward_std": 0.21288388967514038,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6695988178253174,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.6911502480506897,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.785231351852417,
"adv/mean_abs_reasoning": 0.4677700400352478,
"adv/mean_abs_step_conf": 0.7750120162963867,
"adv/ratio_final_to_reasoning": 1.6786696125156875,
"adv/ratio_step_to_reasoning": 1.656822690563909,
"adv/std_final_conf": 0.9293057918548584,
"adv/std_reasoning": 0.720548152923584,
"adv/std_step_conf": 0.9337735772132874,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.48350653704324503,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.2613888888888889,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.34523809523809523,
"calib/gap": -0.0022225947033186477,
"calib/mean_conf": 0.8844047619047619,
"calib/mu_c": 0.8835668789808919,
"calib/mu_w": 0.8857894736842106,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2613888888888889,
"calib/std_conf": 0.0453387039135699,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7862021857923497,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.00939590306983662,
"calib/step_q_w": 0.7768062827225131,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2854.0,
"completions/max_terminated_length": 2854.0,
"completions/mean_length": 509.8984375,
"completions/mean_terminated_length": 511.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.04875704273581505,
"learning_rate": 2.5e-06,
"loss": 0.0244,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03302618861198425,
"mask/share_reasoning": 0.8515468835830688,
"mask/share_step_conf": 0.11152061820030212,
"num_tokens": 2354485.0,
"reward": 0.869525671005249,
"reward_std": 0.17988115549087524,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6828699111938477,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7366502285003662,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7646981477737427,
"adv/mean_abs_reasoning": 0.4427228271961212,
"adv/mean_abs_step_conf": 0.7652941942214966,
"adv/ratio_final_to_reasoning": 1.727261620135503,
"adv/ratio_step_to_reasoning": 1.728607939799047,
"adv/std_final_conf": 0.9307788014411926,
"adv/std_reasoning": 0.7205932140350342,
"adv/std_step_conf": 0.9334892630577087,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4243208692873123,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.320748031496063,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3937007874015748,
"calib/gap": -0.02912815596037066,
"calib/mean_conf": 0.8799606299212598,
"calib/mu_c": 0.8679194630872483,
"calib/mu_w": 0.897047619047619,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30704724409448825,
"calib/std_conf": 0.09864235099908569,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7714735516372797,
"calib/step_q_c_n": 794.0,
"calib/step_q_gap": -0.011438262006646993,
"calib/step_q_w": 0.7829118136439267,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2984.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 527.08203125,
"completions/mean_terminated_length": 529.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.04273128882050514,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0182,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03175712376832962,
"mask/share_reasoning": 0.848095178604126,
"mask/share_step_conf": 0.11624141037464142,
"num_tokens": 2593898.0,
"reward": 0.8353488445281982,
"reward_std": 0.18423444032669067,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6413179636001587,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7153171300888062,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7719697952270508,
"adv/mean_abs_reasoning": 0.4082704186439514,
"adv/mean_abs_step_conf": 0.762624979019165,
"adv/ratio_final_to_reasoning": 1.890829606982322,
"adv/ratio_step_to_reasoning": 1.8679408161683217,
"adv/std_final_conf": 0.9258157014846802,
"adv/std_reasoning": 0.681792676448822,
"adv/std_step_conf": 0.9339790940284729,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.471031746031746,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.24112449799196795,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.42971887550200805,
"calib/gap": -0.00808441558441575,
"calib/mean_conf": 0.8933333333333333,
"calib/mu_c": 0.8906060606060605,
"calib/mu_w": 0.8986904761904763,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23590361445783142,
"calib/std_conf": 0.0485423399560318,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7722249690976515,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.020690158971069295,
"calib/step_q_w": 0.7515348101265822,
"calib/step_q_w_n": 632.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 482.05078125,
"completions/mean_terminated_length": 485.8464660644531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.0128,
"grad_norm": 0.03738216683268547,
"learning_rate": 3e-06,
"loss": 0.0016,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03594135120511055,
"mask/share_reasoning": 0.8288105726242065,
"mask/share_step_conf": 0.1274355798959732,
"num_tokens": 2821479.0,
"reward": 0.8925774097442627,
"reward_std": 0.18727068603038788,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6938386559486389,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7686598300933838,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7644088268280029,
"adv/mean_abs_reasoning": 0.40752342343330383,
"adv/mean_abs_step_conf": 0.7455885410308838,
"adv/ratio_final_to_reasoning": 1.8757420625985386,
"adv/ratio_step_to_reasoning": 1.8295599667608025,
"adv/std_final_conf": 0.9257137179374695,
"adv/std_reasoning": 0.6815844178199768,
"adv/std_step_conf": 0.9348680973052979,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5062893081761006,
"calib/avg_num_step_conf": 4.67578125,
"calib/ece": 0.2684705882352941,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4745098039215686,
"calib/gap": 0.010391116352201357,
"calib/mean_conf": 0.892,
"calib/mu_c": 0.8959119496855346,
"calib/mu_w": 0.8855208333333332,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2684705882352941,
"calib/std_conf": 0.05516748118585945,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7643037974683544,
"calib/step_q_c_n": 711.0,
"calib/step_q_gap": 0.022390217221440967,
"calib/step_q_w": 0.7419135802469135,
"calib/step_q_w_n": 486.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 474.00390625,
"completions/mean_terminated_length": 474.00390625,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.04036780446767807,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0452,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035814374685287476,
"mask/share_reasoning": 0.8521537780761719,
"mask/share_step_conf": 0.11203181743621826,
"num_tokens": 3047416.0,
"reward": 0.890767514705658,
"reward_std": 0.1727554202079773,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6923031210899353,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7657943964004517,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7648312449455261,
"adv/mean_abs_reasoning": 0.4855183959007263,
"adv/mean_abs_step_conf": 0.7825720906257629,
"adv/ratio_final_to_reasoning": 1.575287880753978,
"adv/ratio_step_to_reasoning": 1.61182788795046,
"adv/std_final_conf": 0.9267758131027222,
"adv/std_reasoning": 0.7393484115600586,
"adv/std_step_conf": 0.9343113899230957,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.451698717948718,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.39339999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.616,
"calib/gap": -0.004999999999999893,
"calib/mean_conf": 0.9134,
"calib/mu_c": 0.911,
"calib/mu_w": 0.9159999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39339999999999997,
"calib/std_conf": 0.03798999868386414,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7223162274618585,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.021849835143614382,
"calib/step_q_w": 0.7004663923182441,
"calib/step_q_w_n": 729.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2203.0,
"completions/max_terminated_length": 2203.0,
"completions/mean_length": 535.94921875,
"completions/mean_terminated_length": 542.3043823242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.046021830290555954,
"learning_rate": 3.5e-06,
"loss": -0.0558,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0320635549724102,
"mask/share_reasoning": 0.840599775314331,
"mask/share_step_conf": 0.11561790108680725,
"num_tokens": 3290019.0,
"reward": 0.8165234923362732,
"reward_std": 0.19196359813213348,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5778292417526245,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7583426237106323,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.764102578163147,
"adv/mean_abs_reasoning": 0.351371169090271,
"adv/mean_abs_step_conf": 0.7570402026176453,
"adv/ratio_final_to_reasoning": 2.174630833091604,
"adv/ratio_step_to_reasoning": 2.1545313594672124,
"adv/std_final_conf": 0.9198284149169922,
"adv/std_reasoning": 0.6185474991798401,
"adv/std_step_conf": 0.9339830279350281,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5321258758758759,
"calib/avg_num_step_conf": 4.71484375,
"calib/ece": 0.3382812499999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.66015625,
"calib/gap": 0.007077077077077054,
"calib/mean_conf": 0.9164062499999999,
"calib/mu_c": 0.9193918918918919,
"calib/mu_w": 0.9123148148148148,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3382812499999999,
"calib/std_conf": 0.04166897780048725,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6848280802292264,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": -0.021380171244250956,
"calib/step_q_w": 0.7062082514734773,
"calib/step_q_w_n": 509.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 947.0,
"completions/max_terminated_length": 947.0,
"completions/mean_length": 449.796875,
"completions/mean_terminated_length": 451.5608215332031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.016,
"grad_norm": 0.035176970064640045,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0255,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03521725535392761,
"mask/share_reasoning": 0.8471869230270386,
"mask/share_step_conf": 0.11368949711322784,
"num_tokens": 3513047.0,
"reward": 0.8684121370315552,
"reward_std": 0.153379887342453,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.643385112285614,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.7778139114379883,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.776970624923706,
"adv/mean_abs_reasoning": 0.40734297037124634,
"adv/mean_abs_step_conf": 0.7721484303474426,
"adv/ratio_final_to_reasoning": 1.907411398840605,
"adv/ratio_step_to_reasoning": 1.8955732306948074,
"adv/std_final_conf": 0.9214528203010559,
"adv/std_reasoning": 0.6816308498382568,
"adv/std_step_conf": 0.9345147609710693,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.548993288590604,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.33036144578313253,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7710843373493976,
"calib/gap": 0.0064422818791946,
"calib/mean_conf": 0.9287550200803213,
"calib/mu_c": 0.9313422818791945,
"calib/mu_w": 0.9248999999999999,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.33036144578313253,
"calib/std_conf": 0.038854793847339794,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6253367346938775,
"calib/step_q_c_n": 980.0,
"calib/step_q_gap": -0.005105163833127424,
"calib/step_q_w": 0.6304418985270049,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2467.0,
"completions/max_terminated_length": 2467.0,
"completions/mean_length": 623.08203125,
"completions/mean_terminated_length": 627.9881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.042664941400289536,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0754,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0263189896941185,
"mask/share_reasoning": 0.8583731651306152,
"mask/share_step_conf": 0.10749533772468567,
"num_tokens": 3781404.0,
"reward": 0.8671663999557495,
"reward_std": 0.16728359460830688,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6342976689338684,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7890976667404175,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7509101629257202,
"adv/mean_abs_reasoning": 0.4348292946815491,
"adv/mean_abs_step_conf": 0.7716478109359741,
"adv/ratio_final_to_reasoning": 1.726907943209428,
"adv/ratio_step_to_reasoning": 1.774599412629494,
"adv/std_final_conf": 0.9179096221923828,
"adv/std_reasoning": 0.7013534307479858,
"adv/std_step_conf": 0.934508740901947,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6060933448573897,
"calib/avg_num_step_conf": 5.61328125,
"calib/ece": 0.23671874999999998,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8046875,
"calib/gap": 0.01822529530394701,
"calib/mean_conf": 0.93203125,
"calib/mu_c": 0.9375842696629213,
"calib/mu_w": 0.9193589743589743,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23671874999999998,
"calib/std_conf": 0.04428881375062442,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6054375614552606,
"calib/step_q_c_n": 1017.0,
"calib/step_q_gap": 0.014270894788594002,
"calib/step_q_w": 0.5911666666666666,
"calib/step_q_w_n": 420.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1540.0,
"completions/max_terminated_length": 1540.0,
"completions/mean_length": 501.57421875,
"completions/mean_terminated_length": 503.54119873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.049540191888809204,
"learning_rate": 4.25e-06,
"loss": 0.0029,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03329847380518913,
"mask/share_reasoning": 0.840636134147644,
"mask/share_step_conf": 0.12215914577245712,
"num_tokens": 4013335.0,
"reward": 0.9493035078048706,
"reward_std": 0.1720583438873291,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.72954922914505,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8323389291763306,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7602880001068115,
"adv/mean_abs_reasoning": 0.4945884346961975,
"adv/mean_abs_step_conf": 0.7385061979293823,
"adv/ratio_final_to_reasoning": 1.5372134622877318,
"adv/ratio_step_to_reasoning": 1.4931732044705253,
"adv/std_final_conf": 0.918209433555603,
"adv/std_reasoning": 0.7575898766517639,
"adv/std_step_conf": 0.9352489709854126,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.46421066456527454,
"calib/avg_num_step_conf": 5.13671875,
"calib/ece": 0.37911646586345377,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.891566265060241,
"calib/gap": -0.0016154452324664836,
"calib/mean_conf": 0.9453815261044177,
"calib/mu_c": 0.9446808510638297,
"calib/mu_w": 0.9462962962962962,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.37911646586345377,
"calib/std_conf": 0.0393822149644015,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6216381766381767,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.044199351189563196,
"calib/step_q_w": 0.5774388254486135,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2924.0,
"completions/max_terminated_length": 2924.0,
"completions/mean_length": 518.8203125,
"completions/mean_terminated_length": 522.905517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.0192,
"grad_norm": 0.027683550491929054,
"learning_rate": 4.5e-06,
"loss": -0.0408,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0321136899292469,
"mask/share_reasoning": 0.8539448976516724,
"mask/share_step_conf": 0.10612886399030685,
"num_tokens": 4256873.0,
"reward": 0.8278531432151794,
"reward_std": 0.2118024230003357,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5909234285354614,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7616577744483948,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7561960816383362,
"adv/mean_abs_reasoning": 0.4372809827327728,
"adv/mean_abs_step_conf": 0.7717266082763672,
"adv/ratio_final_to_reasoning": 1.7293138999837454,
"adv/ratio_step_to_reasoning": 1.7648300263448176,
"adv/std_final_conf": 0.9090594053268433,
"adv/std_reasoning": 0.6816985607147217,
"adv/std_step_conf": 0.9350162148475647,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5666752395036327,
"calib/avg_num_step_conf": 4.6328125,
"calib/ece": 0.3575590551181102,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9291338582677166,
"calib/gap": 0.012099273452066961,
"calib/mean_conf": 0.9520472440944882,
"calib/mu_c": 0.9569536423841057,
"calib/mu_w": 0.9448543689320388,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3575590551181102,
"calib/std_conf": 0.043927975816489856,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5789985486211902,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.013404987252980893,
"calib/step_q_w": 0.5655935613682093,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2766.0,
"completions/max_terminated_length": 2766.0,
"completions/mean_length": 495.1953125,
"completions/mean_terminated_length": 495.1953125,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.029574958607554436,
"learning_rate": 4.75e-06,
"loss": 0.012,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031995534896850586,
"mask/share_reasoning": 0.8613891005516052,
"mask/share_step_conf": 0.10661540180444717,
"num_tokens": 4488403.0,
"reward": 0.8745803833007812,
"reward_std": 0.1895061880350113,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6300226449966431,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8027318716049194,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7200021743774414,
"adv/mean_abs_reasoning": 0.45562490820884705,
"adv/mean_abs_step_conf": 0.7624484300613403,
"adv/ratio_final_to_reasoning": 1.5802520042372452,
"adv/ratio_step_to_reasoning": 1.673412529307667,
"adv/std_final_conf": 0.8881762623786926,
"adv/std_reasoning": 0.7392408847808838,
"adv/std_step_conf": 0.9350560903549194,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49484369964550434,
"calib/avg_num_step_conf": 5.5546875,
"calib/ece": 0.39174603174603184,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9841269841269841,
"calib/gap": -0.000243635191749636,
"calib/mean_conf": 0.9650000000000002,
"calib/mu_c": 0.9648965517241381,
"calib/mu_w": 0.9651401869158878,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3906746031746033,
"calib/std_conf": 0.028984122637736764,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5484782608695652,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.012501584484725492,
"calib/step_q_w": 0.5359766763848397,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 494.953125,
"completions/mean_terminated_length": 494.953125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.027805835008621216,
"learning_rate": 5e-06,
"loss": 0.0766,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03540211543440819,
"mask/share_reasoning": 0.8361475467681885,
"mask/share_step_conf": 0.12845034897327423,
"num_tokens": 4719983.0,
"reward": 0.8451118469238281,
"reward_std": 0.19244298338890076,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5935140252113342,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7865532040596008,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.730783224105835,
"adv/mean_abs_reasoning": 0.5323715209960938,
"adv/mean_abs_step_conf": 0.7476698756217957,
"adv/ratio_final_to_reasoning": 1.3726940591008756,
"adv/ratio_step_to_reasoning": 1.4044137338955847,
"adv/std_final_conf": 0.9159253835678101,
"adv/std_reasoning": 0.7927213907241821,
"adv/std_step_conf": 0.935133695602417,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5736628088833282,
"calib/avg_num_step_conf": 5.69921875,
"calib/ece": 0.4265748031496064,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.984251968503937,
"calib/gap": 0.004594932749452663,
"calib/mean_conf": 0.9738188976377953,
"calib/mu_c": 0.9758992805755397,
"calib/mu_w": 0.971304347826087,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4265748031496064,
"calib/std_conf": 0.0204087820076742,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5161962365591397,
"calib/step_q_c_n": 744.0,
"calib/step_q_gap": 0.00516127152417456,
"calib/step_q_w": 0.5110349650349651,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2188.0,
"completions/max_terminated_length": 2188.0,
"completions/mean_length": 500.796875,
"completions/mean_terminated_length": 500.796875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.0224,
"grad_norm": 0.027695661410689354,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0169,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033906854689121246,
"mask/share_reasoning": 0.8413227796554565,
"mask/share_step_conf": 0.12477035075426102,
"num_tokens": 4951147.0,
"reward": 0.8373119831085205,
"reward_std": 0.213922381401062,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5676566362380981,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7999359965324402,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.6912540197372437,
"adv/mean_abs_reasoning": 0.38511404395103455,
"adv/mean_abs_step_conf": 0.7476215362548828,
"adv/ratio_final_to_reasoning": 1.7949332946817524,
"adv/ratio_step_to_reasoning": 1.9412990723078887,
"adv/std_final_conf": 0.8591064214706421,
"adv/std_reasoning": 0.6612229943275452,
"adv/std_step_conf": 0.934601366519928,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.48827392120075047,
"calib/avg_num_step_conf": 5.75390625,
"calib/ece": 0.33431372549019595,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9921568627450981,
"calib/gap": -0.00037523452157595116,
"calib/mean_conf": 0.9774509803921568,
"calib/mu_c": 0.9773170731707316,
"calib/mu_w": 0.9776923076923075,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33431372549019595,
"calib/std_conf": 0.018916030887293125,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5184591194968553,
"calib/step_q_c_n": 954.0,
"calib/step_q_gap": 0.014644090595121173,
"calib/step_q_w": 0.5038150289017341,
"calib/step_q_w_n": 519.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 485.66796875,
"completions/mean_terminated_length": 485.66796875,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.03960420563817024,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0118,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03314411640167236,
"mask/share_reasoning": 0.8393899202346802,
"mask/share_step_conf": 0.12746594846248627,
"num_tokens": 5177294.0,
"reward": 0.899250328540802,
"reward_std": 0.16604021191596985,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6556215286254883,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8155354261398315,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.6920627355575562,
"adv/mean_abs_reasoning": 0.4975743293762207,
"adv/mean_abs_step_conf": 0.7552143335342407,
"adv/ratio_final_to_reasoning": 1.3908730710146442,
"adv/ratio_step_to_reasoning": 1.517791993974062,
"adv/std_final_conf": 0.8740708231925964,
"adv/std_reasoning": 0.7574687600135803,
"adv/std_step_conf": 0.9346686005592346,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.47294472114616,
"calib/avg_num_step_conf": 5.5625,
"calib/ece": 0.44109374999999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.99609375,
"calib/gap": -0.0029650126053004655,
"calib/mean_conf": 0.9825781250000002,
"calib/mu_c": 0.9812230215827337,
"calib/mu_w": 0.9841880341880341,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44035156249999996,
"calib/std_conf": 0.015499299064292392,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5508701298701298,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.03620652130743862,
"calib/step_q_w": 0.5146636085626912,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 502.3828125,
"completions/mean_terminated_length": 504.35296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.031366974115371704,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0028,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034005146473646164,
"mask/share_reasoning": 0.8374161720275879,
"mask/share_step_conf": 0.12467247992753983,
"num_tokens": 5409840.0,
"reward": 0.8348518013954163,
"reward_std": 0.1989361047744751,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5568780899047852,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8042316436767578,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7143040895462036,
"adv/mean_abs_reasoning": 0.5833674669265747,
"adv/mean_abs_step_conf": 0.765878438949585,
"adv/ratio_final_to_reasoning": 1.2244496480228118,
"adv/ratio_step_to_reasoning": 1.3128576452584078,
"adv/std_final_conf": 0.9137648940086365,
"adv/std_reasoning": 0.8265848159790039,
"adv/std_step_conf": 0.9351579546928406,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5699480138629699,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.4768979591836736,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.003844974673420265,
"calib/mean_conf": 0.9830204081632654,
"calib/mu_c": 0.9849193548387096,
"calib/mu_w": 0.9810743801652894,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.4768979591836736,
"calib/std_conf": 0.011457979655793888,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5617678100263852,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.03691368502638526,
"calib/step_q_w": 0.524854125,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 612.45703125,
"completions/mean_terminated_length": 612.45703125,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0256,
"grad_norm": 0.036034706979990005,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0124,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03095119819045067,
"mask/share_reasoning": 0.8476190567016602,
"mask/share_step_conf": 0.12142970412969589,
"num_tokens": 5671141.0,
"reward": 0.7759683728218079,
"reward_std": 0.24071954190731049,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5017863512039185,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7626503705978394,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.6553305983543396,
"adv/mean_abs_reasoning": 0.42382392287254333,
"adv/mean_abs_step_conf": 0.7621168494224548,
"adv/ratio_final_to_reasoning": 1.5462331477485223,
"adv/ratio_step_to_reasoning": 1.798192146061671,
"adv/std_final_conf": 0.8428783416748047,
"adv/std_reasoning": 0.7013589143753052,
"adv/std_step_conf": 0.9344834089279175,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5387245961438248,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.3846245059288538,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007731891610215369,
"calib/mean_conf": 0.9854150197628458,
"calib/mu_c": 0.9857236842105263,
"calib/mu_w": 0.9849504950495047,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3846245059288538,
"calib/std_conf": 0.009257284905080909,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5470713391739674,
"calib/step_q_c_n": 799.0,
"calib/step_q_gap": 0.01965523179141715,
"calib/step_q_w": 0.5274161073825503,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2119.0,
"completions/max_terminated_length": 2119.0,
"completions/mean_length": 486.20703125,
"completions/mean_terminated_length": 488.11376953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.046743251383304596,
"learning_rate": 4.861111111111111e-06,
"loss": -0.03,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03313041105866432,
"mask/share_reasoning": 0.8396614789962769,
"mask/share_step_conf": 0.12330187857151031,
"num_tokens": 5898834.0,
"reward": 0.8585910797119141,
"reward_std": 0.18948256969451904,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6053300499916077,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7954457402229309,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.5678051710128784,
"adv/mean_abs_reasoning": 0.40700194239616394,
"adv/mean_abs_step_conf": 0.7359786033630371,
"adv/ratio_final_to_reasoning": 1.3950920422394282,
"adv/ratio_step_to_reasoning": 1.808292606738119,
"adv/std_final_conf": 0.7967524528503418,
"adv/std_reasoning": 0.6817211508750916,
"adv/std_step_conf": 0.9349690675735474,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5277665043290043,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.37139999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0006831709956709453,
"calib/mean_conf": 0.9873999999999999,
"calib/mu_c": 0.9876623376623378,
"calib/mu_w": 0.9869791666666669,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.37139999999999995,
"calib/std_conf": 0.007158212067269318,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5790322580645161,
"calib/step_q_c_n": 744.0,
"calib/step_q_gap": 0.07202919996054047,
"calib/step_q_w": 0.5070030581039756,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2547.0,
"completions/max_terminated_length": 2547.0,
"completions/mean_length": 517.3203125,
"completions/mean_terminated_length": 521.3936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.04296651855111122,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0238,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030154578387737274,
"mask/share_reasoning": 0.8507611751556396,
"mask/share_step_conf": 0.1112716943025589,
"num_tokens": 6136508.0,
"reward": 0.862993597984314,
"reward_std": 0.19204775989055634,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6108921766281128,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8002512454986572,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.5804013013839722,
"adv/mean_abs_reasoning": 0.4634118974208832,
"adv/mean_abs_step_conf": 0.7674976587295532,
"adv/ratio_final_to_reasoning": 1.2524523099518872,
"adv/ratio_step_to_reasoning": 1.656188939043339,
"adv/std_final_conf": 0.8160413503646851,
"adv/std_reasoning": 0.7392897605895996,
"adv/std_step_conf": 0.9348019957542419,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5466738881101059,
"calib/avg_num_step_conf": 6.0234375,
"calib/ece": 0.4584462151394423,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.001320249776984861,
"calib/mean_conf": 0.9883266932270917,
"calib/mu_c": 0.9889473684210525,
"calib/mu_w": 0.9876271186440676,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4584462151394423,
"calib/std_conf": 0.006708860513081049,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5329224904701397,
"calib/step_q_c_n": 787.0,
"calib/step_q_gap": -0.0029583042318469532,
"calib/step_q_w": 0.5358807947019867,
"calib/step_q_w_n": 755.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3051.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 521.30859375,
"completions/mean_terminated_length": 525.4133911132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.0288,
"grad_norm": 0.03022969514131546,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0168,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032159917056560516,
"mask/share_reasoning": 0.8321437835693359,
"mask/share_step_conf": 0.12788382172584534,
"num_tokens": 6375179.0,
"reward": 0.8111745119094849,
"reward_std": 0.2072419822216034,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5307597517967224,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7915892004966736,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.6053963899612427,
"adv/mean_abs_reasoning": 0.4087706208229065,
"adv/mean_abs_step_conf": 0.7480028867721558,
"adv/ratio_final_to_reasoning": 1.4810173704326006,
"adv/ratio_step_to_reasoning": 1.82988416649497,
"adv/std_final_conf": 0.7858245968818665,
"adv/std_reasoning": 0.6818886995315552,
"adv/std_step_conf": 0.9349893927574158,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5225119381490184,
"calib/avg_num_step_conf": 5.0625,
"calib/ece": 0.3102032520325204,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00048737967103795743,
"calib/mean_conf": 0.9890650406504066,
"calib/mu_c": 0.9892215568862275,
"calib/mu_w": 0.9887341772151895,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3102032520325204,
"calib/std_conf": 0.007066743178062233,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5377925211097708,
"calib/step_q_c_n": 829.0,
"calib/step_q_gap": 0.03231072239456745,
"calib/step_q_w": 0.5054817987152034,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2992.0,
"completions/max_terminated_length": 2992.0,
"completions/mean_length": 573.41015625,
"completions/mean_terminated_length": 573.41015625,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.04057210683822632,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0274,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02998589724302292,
"mask/share_reasoning": 0.865739107131958,
"mask/share_step_conf": 0.10427501797676086,
"num_tokens": 6628916.0,
"reward": 0.8797591924667358,
"reward_std": 0.2143753170967102,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6552281379699707,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.7824151515960693,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.6272884607315063,
"adv/mean_abs_reasoning": 0.4339601397514343,
"adv/mean_abs_step_conf": 0.7357267141342163,
"adv/ratio_final_to_reasoning": 1.445497877041904,
"adv/ratio_step_to_reasoning": 1.6953785537898232,
"adv/std_final_conf": 0.8206400275230408,
"adv/std_reasoning": 0.7014197111129761,
"adv/std_step_conf": 0.9344742894172668,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5213920817369093,
"calib/avg_num_step_conf": 5.9765625,
"calib/ece": 0.45179282868525916,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0010964240102171452,
"calib/mean_conf": 0.9896414342629484,
"calib/mu_c": 0.9901481481481481,
"calib/mu_w": 0.989051724137931,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45179282868525916,
"calib/std_conf": 0.006991095384555814,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5233561643835616,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.06595616438356156,
"calib/step_q_w": 0.45740000000000003,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2913.0,
"completions/max_terminated_length": 2913.0,
"completions/mean_length": 621.78125,
"completions/mean_terminated_length": 624.2196655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.050297658890485764,
"learning_rate": 4.75e-06,
"loss": -0.0194,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.027649879455566406,
"mask/share_reasoning": 0.8580390214920044,
"mask/share_step_conf": 0.11040481925010681,
"num_tokens": 6895220.0,
"reward": 0.831365704536438,
"reward_std": 0.18563544750213623,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5371124744415283,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8240563869476318,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7028975486755371,
"adv/mean_abs_reasoning": 0.5859472155570984,
"adv/mean_abs_step_conf": 0.7569370269775391,
"adv/ratio_final_to_reasoning": 1.199591925711681,
"adv/ratio_step_to_reasoning": 1.2918177727969395,
"adv/std_final_conf": 0.86234050989151,
"adv/std_reasoning": 0.8099966645240784,
"adv/std_step_conf": 0.9351039528846741,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6026850507982584,
"calib/avg_num_step_conf": 6.09765625,
"calib/ece": 0.41361445783132533,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0034272331442141146,
"calib/mean_conf": 0.9879116465863454,
"calib/mu_c": 0.9893706293706291,
"calib/mu_w": 0.985943396226415,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.41361445783132533,
"calib/std_conf": 0.007089235072647791,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.48403614457831323,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.028659947587889145,
"calib/step_q_w": 0.4553761969904241,
"calib/step_q_w_n": 731.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2724.0,
"completions/max_terminated_length": 2724.0,
"completions/mean_length": 637.62890625,
"completions/mean_terminated_length": 637.62890625,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.032,
"grad_norm": 0.037932589650154114,
"learning_rate": 4.722222222222222e-06,
"loss": 0.034,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.027265505865216255,
"mask/share_reasoning": 0.8616552352905273,
"mask/share_step_conf": 0.11107931286096573,
"num_tokens": 7165437.0,
"reward": 0.829302966594696,
"reward_std": 0.25291186571121216,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5660597681999207,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7878586053848267,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.6084252595901489,
"adv/mean_abs_reasoning": 0.4179837703704834,
"adv/mean_abs_step_conf": 0.7556965351104736,
"adv/ratio_final_to_reasoning": 1.4556193391213876,
"adv/ratio_step_to_reasoning": 1.8079566449210593,
"adv/std_final_conf": 0.8313267827033997,
"adv/std_reasoning": 0.7013306617736816,
"adv/std_step_conf": 0.9342578053474426,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.47851465474416294,
"calib/avg_num_step_conf": 6.32421875,
"calib/ece": 0.5090157480314961,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002893691008444277,
"calib/mean_conf": 0.9893307086614174,
"calib/mu_c": 0.9891803278688525,
"calib/mu_w": 0.9894696969696969,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5090157480314961,
"calib/std_conf": 0.005677077985938627,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4703526093088857,
"calib/step_q_c_n": 709.0,
"calib/step_q_gap": 0.05286909282536928,
"calib/step_q_w": 0.41748351648351645,
"calib/step_q_w_n": 910.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2382.0,
"completions/max_terminated_length": 2382.0,
"completions/mean_length": 589.484375,
"completions/mean_terminated_length": 589.484375,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.06749963015317917,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0608,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.028801191598176956,
"mask/share_reasoning": 0.8561272621154785,
"mask/share_step_conf": 0.11507159471511841,
"num_tokens": 7422257.0,
"reward": 0.803888201713562,
"reward_std": 0.1765424907207489,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.48727697134017944,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8267495036125183,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.6257480382919312,
"adv/mean_abs_reasoning": 0.44627708196640015,
"adv/mean_abs_step_conf": 0.7526584267616272,
"adv/ratio_final_to_reasoning": 1.4021514067779157,
"adv/ratio_step_to_reasoning": 1.6865271759984624,
"adv/std_final_conf": 0.8275420665740967,
"adv/std_reasoning": 0.7205285429954529,
"adv/std_step_conf": 0.9348967671394348,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5414493293591655,
"calib/avg_num_step_conf": 5.81640625,
"calib/ece": 0.46748031496063003,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.001327620466964552,
"calib/mean_conf": 0.9871653543307087,
"calib/mu_c": 0.9878030303030302,
"calib/mu_w": 0.9864754098360656,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46748031496063003,
"calib/std_conf": 0.006568552830858001,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5145,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.11222014475271408,
"calib/step_q_w": 0.4022798552472859,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 551.62890625,
"completions/mean_terminated_length": 551.62890625,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.04086047038435936,
"learning_rate": 4.666666666666667e-06,
"loss": 0.033,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030038587749004364,
"mask/share_reasoning": 0.8557687401771545,
"mask/share_step_conf": 0.11419267952442169,
"num_tokens": 7670178.0,
"reward": 0.8285809755325317,
"reward_std": 0.192602276802063,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5283093452453613,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8272901773452759,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.6068408489227295,
"adv/mean_abs_reasoning": 0.4078954756259918,
"adv/mean_abs_step_conf": 0.7853239178657532,
"adv/ratio_final_to_reasoning": 1.4877361608177164,
"adv/ratio_step_to_reasoning": 1.925306763112601,
"adv/std_final_conf": 0.798178493976593,
"adv/std_reasoning": 0.6613113284111023,
"adv/std_step_conf": 0.9336666464805603,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5599961479198767,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.5142000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0017398561890086262,
"calib/mean_conf": 0.9862000000000001,
"calib/mu_c": 0.9871186440677965,
"calib/mu_w": 0.9853787878787879,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5142000000000001,
"calib/std_conf": 0.007124605252222757,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.509986013986014,
"calib/step_q_c_n": 715.0,
"calib/step_q_gap": 0.03103934731934732,
"calib/step_q_w": 0.47894666666666663,
"calib/step_q_w_n": 750.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 576.09375,
"completions/mean_terminated_length": 578.3529663085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0352,
"grad_norm": 0.06224135309457779,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0111,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.029109252616763115,
"mask/share_reasoning": 0.8572879433631897,
"mask/share_step_conf": 0.10969658195972443,
"num_tokens": 7924530.0,
"reward": 0.777721643447876,
"reward_std": 0.16657274961471558,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.47578006982803345,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7921631336212158,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7188222408294678,
"adv/mean_abs_reasoning": 0.48082005977630615,
"adv/mean_abs_step_conf": 0.7605365514755249,
"adv/ratio_final_to_reasoning": 1.4949922038691321,
"adv/ratio_step_to_reasoning": 1.5817487977297626,
"adv/std_final_conf": 0.8647890686988831,
"adv/std_reasoning": 0.7206701040267944,
"adv/std_step_conf": 0.93436199426651,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4588960657662948,
"calib/avg_num_step_conf": 5.96875,
"calib/ece": 0.4540322580645162,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0015502055196713327,
"calib/mean_conf": 0.9822580645161291,
"calib/mu_c": 0.9815267175572517,
"calib/mu_w": 0.9830769230769231,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4540322580645162,
"calib/std_conf": 0.009361788297375138,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5020348058902275,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.04005017080700091,
"calib/step_q_w": 0.4619846350832266,
"calib/step_q_w_n": 781.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 525.140625,
"completions/mean_terminated_length": 529.2755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.06206882745027542,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03148376941680908,
"mask/share_reasoning": 0.8310195207595825,
"mask/share_step_conf": 0.1296842396259308,
"num_tokens": 8164078.0,
"reward": 0.8166903257369995,
"reward_std": 0.2020808607339859,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5267976522445679,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8097079396247864,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.641136884689331,
"adv/mean_abs_reasoning": 0.4908233880996704,
"adv/mean_abs_step_conf": 0.7527388334274292,
"adv/ratio_final_to_reasoning": 1.306247624367763,
"adv/ratio_step_to_reasoning": 1.5336246227829962,
"adv/std_final_conf": 0.8167731761932373,
"adv/std_reasoning": 0.7394091486930847,
"adv/std_step_conf": 0.9343294501304626,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5396810506566605,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.4589723320158102,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0011894934333956941,
"calib/mean_conf": 0.9728063241106719,
"calib/mu_c": 0.9733846153846153,
"calib/mu_w": 0.9721951219512196,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4589723320158102,
"calib/std_conf": 0.007252506689845704,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5793015873015873,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.09160043787629996,
"calib/step_q_w": 0.48770114942528736,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2625.0,
"completions/max_terminated_length": 2625.0,
"completions/mean_length": 572.7890625,
"completions/mean_terminated_length": 575.0353393554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.06816098839044571,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0431,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02892582304775715,
"mask/share_reasoning": 0.868759274482727,
"mask/share_step_conf": 0.09840866923332214,
"num_tokens": 8419968.0,
"reward": 0.8196977376937866,
"reward_std": 0.2199438214302063,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.53374844789505,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8064281940460205,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.4820092022418976,
"adv/mean_abs_reasoning": 0.343949556350708,
"adv/mean_abs_step_conf": 0.747795820236206,
"adv/ratio_final_to_reasoning": 1.4013950398889805,
"adv/ratio_step_to_reasoning": 2.1741438720558093,
"adv/std_final_conf": 0.7245348691940308,
"adv/std_reasoning": 0.640119731426239,
"adv/std_step_conf": 0.9329690337181091,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5227504244482173,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.21769841269841272,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007130730050936007,
"calib/mean_conf": 0.9716666666666667,
"calib/mu_c": 0.971842105263158,
"calib/mu_w": 0.9711290322580644,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21769841269841272,
"calib/std_conf": 0.0053079754308590415,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6222383419689119,
"calib/step_q_c_n": 965.0,
"calib/step_q_gap": 0.021569737317749138,
"calib/step_q_w": 0.6006686046511628,
"calib/step_q_w_n": 344.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 513.18359375,
"completions/mean_terminated_length": 513.18359375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0384,
"grad_norm": 0.05701802670955658,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0635,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03495626896619797,
"mask/share_reasoning": 0.8420665264129639,
"mask/share_step_conf": 0.12297721952199936,
"num_tokens": 8654055.0,
"reward": 0.9756142497062683,
"reward_std": 0.15324629843235016,
"rewards/accuracy_reward_step": 0.74609375,
"rewards/final_brier_reward_step": 0.7553539276123047,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8497809171676636,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.5717979073524475,
"adv/mean_abs_reasoning": 0.3663122057914734,
"adv/mean_abs_step_conf": 0.7703781127929688,
"adv/ratio_final_to_reasoning": 1.5609578340885226,
"adv/ratio_step_to_reasoning": 2.103064273079433,
"adv/std_final_conf": 0.7795237302780151,
"adv/std_reasoning": 0.6403769254684448,
"adv/std_step_conf": 0.9331871271133423,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5274299344066786,
"calib/avg_num_step_conf": 5.15234375,
"calib/ece": 0.4965040650406506,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005326972768836535,
"calib/mean_conf": 0.9721138211382115,
"calib/mu_c": 0.9723931623931625,
"calib/mu_w": 0.9718604651162789,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4965040650406506,
"calib/std_conf": 0.005808884773299599,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6934782608695652,
"calib/step_q_c_n": 506.0,
"calib/step_q_gap": 0.14440077009465746,
"calib/step_q_w": 0.5490774907749078,
"calib/step_q_w_n": 813.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2781.0,
"completions/max_terminated_length": 2781.0,
"completions/mean_length": 512.2578125,
"completions/mean_terminated_length": 518.33203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.05277741327881813,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0848,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03219691663980484,
"mask/share_reasoning": 0.8508118391036987,
"mask/share_step_conf": 0.10527247190475464,
"num_tokens": 8892289.0,
"reward": 0.7643100619316101,
"reward_std": 0.17383795976638794,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.48461097478866577,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7604154348373413,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.6224730610847473,
"adv/mean_abs_reasoning": 0.40885424613952637,
"adv/mean_abs_step_conf": 0.7654492855072021,
"adv/ratio_final_to_reasoning": 1.5224815859496321,
"adv/ratio_step_to_reasoning": 1.872181328027552,
"adv/std_final_conf": 0.8149917721748352,
"adv/std_reasoning": 0.7014630436897278,
"adv/std_step_conf": 0.9333143830299377,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5132510013351135,
"calib/avg_num_step_conf": 4.8359375,
"calib/ece": 0.40668016194332,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003718291054739531,
"calib/mean_conf": 0.9734817813765184,
"calib/mu_c": 0.9736428571428571,
"calib/mu_w": 0.9732710280373832,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.40668016194332,
"calib/std_conf": 0.007311924711082845,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.670029542097489,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.056250575965581584,
"calib/step_q_w": 0.6137789661319074,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 536.30078125,
"completions/mean_terminated_length": 540.5236206054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.06288687139749527,
"learning_rate": 4.5e-06,
"loss": 0.035,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03290407732129097,
"mask/share_reasoning": 0.8492652773857117,
"mask/share_step_conf": 0.11001814901828766,
"num_tokens": 9136470.0,
"reward": 0.8197071552276611,
"reward_std": 0.1978277713060379,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5682578086853027,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7695938348770142,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.70289146900177,
"adv/mean_abs_reasoning": 0.47861048579216003,
"adv/mean_abs_step_conf": 0.7290204763412476,
"adv/ratio_final_to_reasoning": 1.4686085864550105,
"adv/ratio_step_to_reasoning": 1.5232020567510713,
"adv/std_final_conf": 0.862726628780365,
"adv/std_reasoning": 0.7575022578239441,
"adv/std_step_conf": 0.9335668087005615,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5760536398467433,
"calib/avg_num_step_conf": 5.33984375,
"calib/ece": 0.4377290836653387,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0031577266922091507,
"calib/mean_conf": 0.9755776892430279,
"calib/mu_c": 0.9770370370370368,
"calib/mu_w": 0.9738793103448277,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4377290836653387,
"calib/std_conf": 0.00905741646755723,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.644745508982036,
"calib/step_q_c_n": 668.0,
"calib/step_q_gap": 0.0755895719291032,
"calib/step_q_w": 0.5691559370529328,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2262.0,
"completions/max_terminated_length": 2262.0,
"completions/mean_length": 502.2421875,
"completions/mean_terminated_length": 506.19683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.0416,
"grad_norm": 0.0513564832508564,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0513,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03278471902012825,
"mask/share_reasoning": 0.8430602550506592,
"mask/share_step_conf": 0.11634252965450287,
"num_tokens": 9371132.0,
"reward": 0.8304038047790527,
"reward_std": 0.20517049729824066,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5503504276275635,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8088946342468262,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7009073495864868,
"adv/mean_abs_reasoning": 0.5143710374832153,
"adv/mean_abs_step_conf": 0.7778093814849854,
"adv/ratio_final_to_reasoning": 1.3626493299777953,
"adv/ratio_step_to_reasoning": 1.5121562545409963,
"adv/std_final_conf": 0.8636792898178101,
"adv/std_reasoning": 0.7575141787528992,
"adv/std_step_conf": 0.9339827299118042,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5441338390017127,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.5020312500000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0015904086126746764,
"calib/mean_conf": 0.9785937500000002,
"calib/mu_c": 0.9794262295081969,
"calib/mu_w": 0.9778358208955222,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5020312500000002,
"calib/std_conf": 0.009373697826231661,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5497952218430033,
"calib/step_q_c_n": 586.0,
"calib/step_q_gap": 0.02492171984566649,
"calib/step_q_w": 0.5248735019973368,
"calib/step_q_w_n": 751.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1553.0,
"completions/max_terminated_length": 1553.0,
"completions/mean_length": 531.94140625,
"completions/mean_terminated_length": 534.0274658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.05533498525619507,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0268,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.032788656651973724,
"mask/share_reasoning": 0.8522244691848755,
"mask/share_step_conf": 0.11108061671257019,
"num_tokens": 9614069.0,
"reward": 0.7950161099433899,
"reward_std": 0.20596131682395935,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.49921953678131104,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.7955002188682556,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.5990781784057617,
"adv/mean_abs_reasoning": 0.3729305863380432,
"adv/mean_abs_step_conf": 0.7478874921798706,
"adv/ratio_final_to_reasoning": 1.606406662130756,
"adv/ratio_step_to_reasoning": 2.0054335031183186,
"adv/std_final_conf": 0.7746619582176208,
"adv/std_reasoning": 0.6612075567245483,
"adv/std_step_conf": 0.9326661229133606,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7066062176165804,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.21948616600790516,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.008082037996545943,
"calib/mean_conf": 0.9823320158102767,
"calib/mu_c": 0.9842487046632125,
"calib/mu_w": 0.9761666666666665,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21948616600790516,
"calib/std_conf": 0.009642648733238994,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5260993852459016,
"calib/step_q_c_n": 976.0,
"calib/step_q_gap": 0.0586591122083589,
"calib/step_q_w": 0.4674402730375427,
"calib/step_q_w_n": 293.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1543.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 469.05859375,
"completions/mean_terminated_length": 472.751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.0854325145483017,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0355,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034799594432115555,
"mask/share_reasoning": 0.837127149105072,
"mask/share_step_conf": 0.12026076018810272,
"num_tokens": 9841396.0,
"reward": 0.973730742931366,
"reward_std": 0.15697798132896423,
"rewards/accuracy_reward_step": 0.75390625,
"rewards/final_brier_reward_step": 0.7607718706130981,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8390333652496338,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.47649919986724854,
"adv/mean_abs_reasoning": 0.34694188833236694,
"adv/mean_abs_step_conf": 0.7612089514732361,
"adv/ratio_final_to_reasoning": 1.3734265474763512,
"adv/ratio_step_to_reasoning": 2.1940531745305005,
"adv/std_final_conf": 0.7270616292953491,
"adv/std_reasoning": 0.6401383876800537,
"adv/std_step_conf": 0.9325536489486694,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5395939397656874,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.4519140625000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0014347052689687168,
"calib/mean_conf": 0.9870703125000001,
"calib/mu_c": 0.9877372262773723,
"calib/mu_w": 0.9863025210084035,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4519140625000001,
"calib/std_conf": 0.006988476668941797,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46700947225981054,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.05923169448203286,
"calib/step_q_w": 0.4077777777777777,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 989.0,
"completions/max_terminated_length": 989.0,
"completions/mean_length": 442.296875,
"completions/mean_terminated_length": 444.0314025878906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0448,
"grad_norm": 0.04096578061580658,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0075,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03530528023838997,
"mask/share_reasoning": 0.8296844959259033,
"mask/share_step_conf": 0.1311040222644806,
"num_tokens": 10058992.0,
"reward": 0.8483582735061646,
"reward_std": 0.14226624369621277,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5476745963096619,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8420105576515198,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.6245371103286743,
"adv/mean_abs_reasoning": 0.5628688335418701,
"adv/mean_abs_step_conf": 0.7604486346244812,
"adv/ratio_final_to_reasoning": 1.1095606526990571,
"adv/ratio_step_to_reasoning": 1.3510228126139687,
"adv/std_final_conf": 0.8431703448295593,
"adv/std_reasoning": 0.8098235726356506,
"adv/std_step_conf": 0.9338532090187073,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5481167608286253,
"calib/avg_num_step_conf": 5.484375,
"calib/ece": 0.45549407114624507,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0016315128688011438,
"calib/mean_conf": 0.9890909090909091,
"calib/mu_c": 0.9898518518518519,
"calib/mu_w": 0.9882203389830507,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45549407114624507,
"calib/std_conf": 0.003921617946185762,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4100944669365722,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": 0.02415178217035796,
"calib/step_q_w": 0.38594268476621424,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2721.0,
"completions/max_terminated_length": 2721.0,
"completions/mean_length": 527.6796875,
"completions/mean_terminated_length": 529.7490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.045065395534038544,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0102,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03222114220261574,
"mask/share_reasoning": 0.8443077206611633,
"mask/share_step_conf": 0.11956489086151123,
"num_tokens": 10299302.0,
"reward": 0.8342925310134888,
"reward_std": 0.21557238698005676,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5379925966262817,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8282486200332642,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.554121732711792,
"adv/mean_abs_reasoning": 0.42493686079978943,
"adv/mean_abs_step_conf": 0.7594295740127563,
"adv/ratio_final_to_reasoning": 1.3040095690189335,
"adv/ratio_step_to_reasoning": 1.7871586206557977,
"adv/std_final_conf": 0.7965189218521118,
"adv/std_reasoning": 0.7013244032859802,
"adv/std_step_conf": 0.9335551261901855,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5689793154395469,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.501921568627451,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.003040507264220671,
"calib/mean_conf": 0.9881960784313726,
"calib/mu_c": 0.989758064516129,
"calib/mu_w": 0.9867175572519084,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.501921568627451,
"calib/std_conf": 0.007186642314638623,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40267929634641403,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.05079895446607208,
"calib/step_q_w": 0.35188034188034195,
"calib/step_q_w_n": 819.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 560.40234375,
"completions/mean_terminated_length": 560.40234375,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.038281019777059555,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0563,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.028894424438476562,
"mask/share_reasoning": 0.8506920337677002,
"mask/share_step_conf": 0.12041356414556503,
"num_tokens": 10549085.0,
"reward": 0.8303428888320923,
"reward_std": 0.16250118613243103,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.4977785050868988,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8668135404586792,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.5395326018333435,
"adv/mean_abs_reasoning": 0.43404123187065125,
"adv/mean_abs_step_conf": 0.7650111317634583,
"adv/ratio_final_to_reasoning": 1.243044582442181,
"adv/ratio_step_to_reasoning": 1.7625310122413427,
"adv/std_final_conf": 0.7722030282020569,
"adv/std_reasoning": 0.681717038154602,
"adv/std_step_conf": 0.9333689212799072,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5348484848484848,
"calib/avg_num_step_conf": 6.06640625,
"calib/ece": 0.4596428571428573,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9841269841269841,
"calib/gap": 0.007606060606060505,
"calib/mean_conf": 0.9811507936507936,
"calib/mu_c": 0.9847727272727274,
"calib/mu_w": 0.9771666666666668,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4584920634920636,
"calib/std_conf": 0.06814917897237072,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.40513907284768214,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.049236817208584416,
"calib/step_q_w": 0.3559022556390977,
"calib/step_q_w_n": 798.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2267.0,
"completions/max_terminated_length": 2267.0,
"completions/mean_length": 521.2734375,
"completions/mean_terminated_length": 523.3176879882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.048,
"grad_norm": 0.04909267649054527,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0485,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03378835693001747,
"mask/share_reasoning": 0.8306214809417725,
"mask/share_step_conf": 0.13168391585350037,
"num_tokens": 10787579.0,
"reward": 0.8356795310974121,
"reward_std": 0.16655045747756958,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5321097373962402,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8392492532730103,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.4849938750267029,
"adv/mean_abs_reasoning": 0.3777017593383789,
"adv/mean_abs_step_conf": 0.7518752813339233,
"adv/ratio_final_to_reasoning": 1.2840657027287028,
"adv/ratio_step_to_reasoning": 1.9906586684980898,
"adv/std_final_conf": 0.728912353515625,
"adv/std_reasoning": 0.6612749099731445,
"adv/std_step_conf": 0.9323993921279907,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5283769902413971,
"calib/avg_num_step_conf": 6.5703125,
"calib/ece": 0.46016,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0010092449922959323,
"calib/mean_conf": 0.98816,
"calib/mu_c": 0.9886363636363639,
"calib/mu_w": 0.9876271186440679,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46016,
"calib/std_conf": 0.005423504402137058,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38947630922693266,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": 0.05152176377238721,
"calib/step_q_w": 0.33795454545454545,
"calib/step_q_w_n": 880.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 558.0078125,
"completions/mean_terminated_length": 562.4015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.048358555883169174,
"learning_rate": 4.277777777777778e-06,
"loss": -0.067,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03367871418595314,
"mask/share_reasoning": 0.8230705261230469,
"mask/share_step_conf": 0.1354382336139679,
"num_tokens": 11035197.0,
"reward": 0.8295394778251648,
"reward_std": 0.1529095470905304,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5268656015396118,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8337757587432861,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.5534183979034424,
"adv/mean_abs_reasoning": 0.4260653257369995,
"adv/mean_abs_step_conf": 0.7502398490905762,
"adv/ratio_final_to_reasoning": 1.2989050374990032,
"adv/ratio_step_to_reasoning": 1.7608563846232403,
"adv/std_final_conf": 0.7891755700111389,
"adv/std_reasoning": 0.7013570070266724,
"adv/std_step_conf": 0.9326286315917969,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6437258687258687,
"calib/avg_num_step_conf": 6.5390625,
"calib/ece": 0.39592885375494075,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.01943307593307575,
"calib/mean_conf": 0.9772727272727274,
"calib/mu_c": 0.9853378378378377,
"calib/mu_w": 0.9659047619047619,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.39411067193675897,
"calib/std_conf": 0.0814157869391294,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.37110948081264106,
"calib/step_q_c_n": 886.0,
"calib/step_q_gap": 0.048685622944620754,
"calib/step_q_w": 0.3224238578680203,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 579.1171875,
"completions/mean_terminated_length": 579.1171875,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.04179922118782997,
"learning_rate": 4.25e-06,
"loss": 0.0534,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030462728813290596,
"mask/share_reasoning": 0.8413854837417603,
"mask/share_step_conf": 0.1281517744064331,
"num_tokens": 11289427.0,
"reward": 0.8785368800163269,
"reward_std": 0.16066467761993408,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5990324020385742,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8447600603103638,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7094321250915527,
"adv/mean_abs_reasoning": 0.4535606801509857,
"adv/mean_abs_step_conf": 0.7399888038635254,
"adv/ratio_final_to_reasoning": 1.5641393889245205,
"adv/ratio_step_to_reasoning": 1.6315100409876595,
"adv/std_final_conf": 0.8601517081260681,
"adv/std_reasoning": 0.7014860510826111,
"adv/std_step_conf": 0.9333368539810181,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6279370078740159,
"calib/avg_num_step_conf": 6.4140625,
"calib/ece": 0.459920634920635,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9126984126984127,
"calib/gap": 0.03109165354330712,
"calib/mean_conf": 0.9486507936507936,
"calib/mu_c": 0.9643200000000001,
"calib/mu_w": 0.933228346456693,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4562698412698414,
"calib/std_conf": 0.12007836842188596,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.36418791946308726,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.005369636296978009,
"calib/step_q_w": 0.35881828316610925,
"calib/step_q_w_n": 897.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2258.0,
"completions/max_terminated_length": 2258.0,
"completions/mean_length": 506.65234375,
"completions/mean_terminated_length": 512.6600952148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0512,
"grad_norm": 0.05462892726063728,
"learning_rate": 4.222222222222223e-06,
"loss": -0.1157,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03352171555161476,
"mask/share_reasoning": 0.8146562576293945,
"mask/share_step_conf": 0.14010323584079742,
"num_tokens": 11522818.0,
"reward": 0.8315355777740479,
"reward_std": 0.1631363332271576,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5375875234603882,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8317336440086365,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.6704999208450317,
"adv/mean_abs_reasoning": 0.4311797022819519,
"adv/mean_abs_step_conf": 0.7662805318832397,
"adv/ratio_final_to_reasoning": 1.555035910309587,
"adv/ratio_step_to_reasoning": 1.7771720881753443,
"adv/std_final_conf": 0.840907096862793,
"adv/std_reasoning": 0.7014207243919373,
"adv/std_step_conf": 0.9324670433998108,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6658229491173417,
"calib/avg_num_step_conf": 6.609375,
"calib/ece": 0.38549800796812755,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9362549800796812,
"calib/gap": 0.037870586708203424,
"calib/mean_conf": 0.959203187250996,
"calib/mu_c": 0.9753472222222221,
"calib/mu_w": 0.9374766355140187,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.38549800796812755,
"calib/std_conf": 0.07276663122148436,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3687181510710259,
"calib/step_q_c_n": 887.0,
"calib/step_q_gap": 0.037103244238727795,
"calib/step_q_w": 0.3316149068322981,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 518.890625,
"completions/mean_terminated_length": 520.925537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.07325834035873413,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0203,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03269638121128082,
"mask/share_reasoning": 0.8214113116264343,
"mask/share_step_conf": 0.14198604226112366,
"num_tokens": 11760190.0,
"reward": 0.877612292766571,
"reward_std": 0.1599796712398529,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6075612902641296,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8398506045341492,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7164367437362671,
"adv/mean_abs_reasoning": 0.42041462659835815,
"adv/mean_abs_step_conf": 0.7300196290016174,
"adv/ratio_final_to_reasoning": 1.7041194535334585,
"adv/ratio_step_to_reasoning": 1.7364277615846118,
"adv/std_final_conf": 0.8789020776748657,
"adv/std_reasoning": 0.6817552447319031,
"adv/std_step_conf": 0.9324972629547119,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.723894389438944,
"calib/avg_num_step_conf": 6.44921875,
"calib/ece": 0.3033067729083666,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7569721115537849,
"calib/gap": 0.09033597359735956,
"calib/mean_conf": 0.9009163346613547,
"calib/mu_c": 0.9372666666666665,
"calib/mu_w": 0.8469306930693069,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3033067729083666,
"calib/std_conf": 0.13599990102861245,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35842902208201893,
"calib/step_q_c_n": 951.0,
"calib/step_q_gap": 0.007071879224876065,
"calib/step_q_w": 0.35135714285714287,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2378.0,
"completions/max_terminated_length": 2378.0,
"completions/mean_length": 553.734375,
"completions/mean_terminated_length": 553.734375,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.07642224431037903,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0541,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031201042234897614,
"mask/share_reasoning": 0.8356307744979858,
"mask/share_step_conf": 0.13316819071769714,
"num_tokens": 12007306.0,
"reward": 0.916384220123291,
"reward_std": 0.16534239053726196,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6789581775665283,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8405289053916931,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7365193367004395,
"adv/mean_abs_reasoning": 0.38996899127960205,
"adv/mean_abs_step_conf": 0.7523127794265747,
"adv/ratio_final_to_reasoning": 1.8886612863338303,
"adv/ratio_step_to_reasoning": 1.929160513398814,
"adv/std_final_conf": 0.9118251800537109,
"adv/std_reasoning": 0.6815586686134338,
"adv/std_step_conf": 0.9332376718521118,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6966089466089466,
"calib/avg_num_step_conf": 6.86328125,
"calib/ece": 0.21509881422924906,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.48221343873517786,
"calib/gap": 0.10533910533910529,
"calib/mean_conf": 0.8164426877470355,
"calib/mu_c": 0.8576623376623378,
"calib/mu_w": 0.7523232323232325,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2114229249011858,
"calib/std_conf": 0.17825015085694332,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3530892678034102,
"calib/step_q_c_n": 997.0,
"calib/step_q_gap": 0.016365583592883926,
"calib/step_q_w": 0.3367236842105263,
"calib/step_q_w_n": 760.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 562.5859375,
"completions/mean_terminated_length": 564.7921752929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.0544,
"grad_norm": 0.14050939679145813,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0199,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030356088653206825,
"mask/share_reasoning": 0.8331259489059448,
"mask/share_step_conf": 0.1326117217540741,
"num_tokens": 12260624.0,
"reward": 0.9410929679870605,
"reward_std": 0.12373049557209015,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7284257411956787,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8357914686203003,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.6990074515342712,
"adv/mean_abs_reasoning": 0.37972062826156616,
"adv/mean_abs_step_conf": 0.7402435541152954,
"adv/ratio_final_to_reasoning": 1.840846663333676,
"adv/ratio_step_to_reasoning": 1.9494425612437025,
"adv/std_final_conf": 0.8831400871276855,
"adv/std_reasoning": 0.6612834930419922,
"adv/std_step_conf": 0.9324589371681213,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7261456628477906,
"calib/avg_num_step_conf": 6.12109375,
"calib/ece": 0.0904347826086957,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4980237154150198,
"calib/gap": 0.17239607201309315,
"calib/mean_conf": 0.7810276679841898,
"calib/mu_c": 0.8253191489361702,
"calib/mu_w": 0.652923076923077,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06418972332015814,
"calib/std_conf": 0.22313020008731313,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37573489630297563,
"calib/step_q_c_n": 1109.0,
"calib/step_q_gap": 0.024424852634853333,
"calib/step_q_w": 0.3513100436681223,
"calib/step_q_w_n": 458.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2378.0,
"completions/max_terminated_length": 2378.0,
"completions/mean_length": 527.47265625,
"completions/mean_terminated_length": 527.47265625,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.1133529543876648,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0067,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03268003463745117,
"mask/share_reasoning": 0.8374246954917908,
"mask/share_step_conf": 0.12989526987075806,
"num_tokens": 12503609.0,
"reward": 0.9939903616905212,
"reward_std": 0.13481810688972473,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.8108843564987183,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8325650691986084,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.6250590085983276,
"adv/mean_abs_reasoning": 0.4454294443130493,
"adv/mean_abs_step_conf": 0.7550910711288452,
"adv/ratio_final_to_reasoning": 1.4032727664923608,
"adv/ratio_step_to_reasoning": 1.695197928132844,
"adv/std_final_conf": 0.8168118596076965,
"adv/std_reasoning": 0.7014402151107788,
"adv/std_step_conf": 0.9312266111373901,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7757667331061533,
"calib/avg_num_step_conf": 6.66015625,
"calib/ece": 0.28917322834645665,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7125984251968503,
"calib/gap": 0.14546454060309888,
"calib/mean_conf": 0.8836614173228348,
"calib/mu_c": 0.9426490066225164,
"calib/mu_w": 0.7971844660194175,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28917322834645665,
"calib/std_conf": 0.17132984812295177,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3879324055666004,
"calib/step_q_c_n": 1006.0,
"calib/step_q_gap": 0.039262877669604734,
"calib/step_q_w": 0.3486695278969957,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3036.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 528.0078125,
"completions/mean_terminated_length": 528.0078125,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.07778654247522354,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0237,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03123798780143261,
"mask/share_reasoning": 0.8346073627471924,
"mask/share_step_conf": 0.13415467739105225,
"num_tokens": 12744603.0,
"reward": 0.93428635597229,
"reward_std": 0.14244474470615387,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7104933261871338,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8416730165481567,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.41671222448349,
"adv/mean_abs_reasoning": 0.2910095453262329,
"adv/mean_abs_step_conf": 0.7577744722366333,
"adv/ratio_final_to_reasoning": 1.4319538007467747,
"adv/ratio_step_to_reasoning": 2.6039505727798,
"adv/std_final_conf": 0.6897390484809875,
"adv/std_reasoning": 0.5959850549697876,
"adv/std_step_conf": 0.9312926530838013,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7402930402930403,
"calib/avg_num_step_conf": 6.12109375,
"calib/ece": 0.20330708661417324,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.905511811023622,
"calib/gap": 0.13131298331298313,
"calib/mean_conf": 0.9474015748031497,
"calib/mu_c": 0.981005291005291,
"calib/mu_w": 0.8496923076923079,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20330708661417324,
"calib/std_conf": 0.13252967338761779,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4237527372262774,
"calib/step_q_c_n": 1096.0,
"calib/step_q_gap": 0.05451706843646842,
"calib/step_q_w": 0.36923566878980896,
"calib/step_q_w_n": 471.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1939.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 462.609375,
"completions/mean_terminated_length": 464.4235534667969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.0576,
"grad_norm": 0.05564220994710922,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0125,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036595359444618225,
"mask/share_reasoning": 0.818428635597229,
"mask/share_step_conf": 0.14106974005699158,
"num_tokens": 12969263.0,
"reward": 0.9893507957458496,
"reward_std": 0.10805167257785797,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7944375276565552,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8381702899932861,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.5934556722640991,
"adv/mean_abs_reasoning": 0.49654126167297363,
"adv/mean_abs_step_conf": 0.7678463459014893,
"adv/ratio_final_to_reasoning": 1.19517896713074,
"adv/ratio_step_to_reasoning": 1.5463898071922964,
"adv/std_final_conf": 0.8131335377693176,
"adv/std_reasoning": 0.739345371723175,
"adv/std_step_conf": 0.9309093952178955,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6096491228070176,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.3576377952755906,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9015748031496063,
"calib/gap": 0.05930727554179582,
"calib/mean_conf": 0.9451968503937008,
"calib/mu_c": 0.969013157894737,
"calib/mu_w": 0.9097058823529411,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35220472440944883,
"calib/std_conf": 0.13621630779669894,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43842369263607256,
"calib/step_q_c_n": 937.0,
"calib/step_q_gap": 0.025516203649288416,
"calib/step_q_w": 0.41290748898678414,
"calib/step_q_w_n": 681.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2064.0,
"completions/max_terminated_length": 2064.0,
"completions/mean_length": 504.35546875,
"completions/mean_terminated_length": 506.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.057675618678331375,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0774,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03339093178510666,
"mask/share_reasoning": 0.8275409936904907,
"mask/share_step_conf": 0.13516178727149963,
"num_tokens": 13206202.0,
"reward": 0.9117587804794312,
"reward_std": 0.19418179988861084,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6443132758140564,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8620166778564453,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.49654895067214966,
"adv/mean_abs_reasoning": 0.4492354691028595,
"adv/mean_abs_step_conf": 0.7373233437538147,
"adv/ratio_final_to_reasoning": 1.1053200043704852,
"adv/ratio_step_to_reasoning": 1.6412847926417693,
"adv/std_final_conf": 0.7745915651321411,
"adv/std_reasoning": 0.7391836047172546,
"adv/std_step_conf": 0.9308094382286072,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6026096505393708,
"calib/avg_num_step_conf": 6.95703125,
"calib/ece": 0.4257600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.936,
"calib/gap": 0.008552419094373875,
"calib/mean_conf": 0.9692000000000001,
"calib/mu_c": 0.9730656934306572,
"calib/mu_w": 0.9645132743362833,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4234800000000001,
"calib/std_conf": 0.07442687686582046,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4720138089758343,
"calib/step_q_c_n": 869.0,
"calib/step_q_gap": 0.04145459844951854,
"calib/step_q_w": 0.43055921052631574,
"calib/step_q_w_n": 912.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2246.0,
"completions/max_terminated_length": 2246.0,
"completions/mean_length": 494.6875,
"completions/mean_terminated_length": 502.5397033691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.03915965557098389,
"learning_rate": 4.000000000000001e-06,
"loss": -0.098,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03251350671052933,
"mask/share_reasoning": 0.8159323930740356,
"mask/share_step_conf": 0.13592907786369324,
"num_tokens": 13439682.0,
"reward": 0.8552824854850769,
"reward_std": 0.1606539785861969,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5601484775543213,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8480727672576904,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.5040005445480347,
"adv/mean_abs_reasoning": 0.4262211322784424,
"adv/mean_abs_step_conf": 0.7493640184402466,
"adv/ratio_final_to_reasoning": 1.1824860533165222,
"adv/ratio_step_to_reasoning": 1.7581578239314068,
"adv/std_final_conf": 0.7608724236488342,
"adv/std_reasoning": 0.7013793587684631,
"adv/std_step_conf": 0.9318333268165588,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5604005167958657,
"calib/avg_num_step_conf": 6.65625,
"calib/ece": 0.2947430830039526,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9604743083003953,
"calib/gap": 0.026538185472293807,
"calib/mean_conf": 0.9745849802371542,
"calib/mu_c": 0.9830813953488371,
"calib/mu_w": 0.9565432098765433,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2947430830039526,
"calib/std_conf": 0.07132515931620358,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48574823943661966,
"calib/step_q_c_n": 1136.0,
"calib/step_q_gap": 0.05451584507042245,
"calib/step_q_w": 0.4312323943661972,
"calib/step_q_w_n": 568.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2502.0,
"completions/max_terminated_length": 2502.0,
"completions/mean_length": 516.07421875,
"completions/mean_terminated_length": 516.07421875,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.0608,
"grad_norm": 0.03392103314399719,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0328,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03265305608510971,
"mask/share_reasoning": 0.8259487152099609,
"mask/share_step_conf": 0.14139823615550995,
"num_tokens": 13678589.0,
"reward": 0.9377506971359253,
"reward_std": 0.17583677172660828,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6937090158462524,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8489797115325928,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.5565872192382812,
"adv/mean_abs_reasoning": 0.49917328357696533,
"adv/mean_abs_step_conf": 0.7326595783233643,
"adv/ratio_final_to_reasoning": 1.115018045937676,
"adv/ratio_step_to_reasoning": 1.467745976053221,
"adv/std_final_conf": 0.7942405343055725,
"adv/std_reasoning": 0.7575743198394775,
"adv/std_step_conf": 0.9322904348373413,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5544391315585422,
"calib/avg_num_step_conf": 7.35546875,
"calib/ece": 0.381984126984127,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9206349206349206,
"calib/gap": 0.023688291548203844,
"calib/mean_conf": 0.9547619047619048,
"calib/mu_c": 0.9647260273972604,
"calib/mu_w": 0.9410377358490566,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3786904761904762,
"calib/std_conf": 0.13093506379005287,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49481312670920685,
"calib/step_q_c_n": 1097.0,
"calib/step_q_gap": 0.05720498421556813,
"calib/step_q_w": 0.4376081424936387,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2708.0,
"completions/max_terminated_length": 2708.0,
"completions/mean_length": 579.21484375,
"completions/mean_terminated_length": 581.486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.04124533385038376,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030935410410165787,
"mask/share_reasoning": 0.827335000038147,
"mask/share_step_conf": 0.13782331347465515,
"num_tokens": 13933188.0,
"reward": 0.871442437171936,
"reward_std": 0.19315657019615173,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5968499779701233,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8358785510063171,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.47267991304397583,
"adv/mean_abs_reasoning": 0.4489648938179016,
"adv/mean_abs_step_conf": 0.7724525332450867,
"adv/ratio_final_to_reasoning": 1.0528215447412976,
"adv/ratio_step_to_reasoning": 1.7205187841666534,
"adv/std_final_conf": 0.7284379601478577,
"adv/std_reasoning": 0.7206330895423889,
"adv/std_step_conf": 0.9324396848678589,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5234210526315789,
"calib/avg_num_step_conf": 6.140625,
"calib/ece": 0.38547619047619064,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9880952380952381,
"calib/gap": 0.0003500000000001835,
"calib/mean_conf": 0.9861111111111113,
"calib/mu_c": 0.9862500000000002,
"calib/mu_w": 0.9859,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3842063492063494,
"calib/std_conf": 0.02774521901392222,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5309918319719953,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.008837985818149097,
"calib/step_q_w": 0.5221538461538462,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2661.0,
"completions/max_terminated_length": 2661.0,
"completions/mean_length": 531.265625,
"completions/mean_terminated_length": 533.3490600585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.037306271493434906,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0795,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03339418023824692,
"mask/share_reasoning": 0.836950421333313,
"mask/share_step_conf": 0.12574917078018188,
"num_tokens": 14175440.0,
"reward": 0.8658956289291382,
"reward_std": 0.1877882331609726,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.5960062742233276,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8217225074768066,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.5334903597831726,
"adv/mean_abs_reasoning": 0.5123406648635864,
"adv/mean_abs_step_conf": 0.735711932182312,
"adv/ratio_final_to_reasoning": 1.0412805314315963,
"adv/ratio_step_to_reasoning": 1.4359819210879923,
"adv/std_final_conf": 0.7582880854606628,
"adv/std_reasoning": 0.7577045559883118,
"adv/std_step_conf": 0.932331919670105,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5634365634365635,
"calib/avg_num_step_conf": 6.55078125,
"calib/ece": 0.4020564516129033,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.967741935483871,
"calib/gap": 0.026596736596736337,
"calib/mean_conf": 0.9786693548387099,
"calib/mu_c": 0.9899300699300697,
"calib/mu_w": 0.9633333333333334,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4020564516129033,
"calib/std_conf": 0.06951041019437462,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5287763157894737,
"calib/step_q_c_n": 760.0,
"calib/step_q_gap": 0.06046661022785976,
"calib/step_q_w": 0.46830970556161394,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2579.0,
"completions/max_terminated_length": 2579.0,
"completions/mean_length": 496.81640625,
"completions/mean_terminated_length": 502.70751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.064,
"grad_norm": 0.03891824930906296,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0029,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034683212637901306,
"mask/share_reasoning": 0.8176734447479248,
"mask/share_step_conf": 0.1359245479106903,
"num_tokens": 14411481.0,
"reward": 0.86177659034729,
"reward_std": 0.2099033147096634,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5835503935813904,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8345340490341187,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.3984984755516052,
"adv/mean_abs_reasoning": 0.3724913001060486,
"adv/mean_abs_step_conf": 0.7677532434463501,
"adv/ratio_final_to_reasoning": 1.0698195513241582,
"adv/ratio_step_to_reasoning": 2.0611306713143906,
"adv/std_final_conf": 0.6604509353637695,
"adv/std_reasoning": 0.64032381772995,
"adv/std_step_conf": 0.9302454590797424,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5178571428571428,
"calib/avg_num_step_conf": 6.25,
"calib/ece": 0.32440000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.996,
"calib/gap": 0.004761904761904745,
"calib/mean_conf": 0.9884000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9852380952380951,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.32440000000000013,
"calib/std_conf": 0.022800000000000004,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5489644351464434,
"calib/step_q_c_n": 956.0,
"calib/step_q_gap": 0.09121598794147445,
"calib/step_q_w": 0.457748447204969,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2342.0,
"completions/max_terminated_length": 2342.0,
"completions/mean_length": 456.85546875,
"completions/mean_terminated_length": 456.85546875,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.04500626400113106,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0342,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03986175358295441,
"mask/share_reasoning": 0.8108029365539551,
"mask/share_step_conf": 0.14933526515960693,
"num_tokens": 14632500.0,
"reward": 0.9083384275436401,
"reward_std": 0.1528836041688919,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6574859619140625,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.834190845489502,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.5679522752761841,
"adv/mean_abs_reasoning": 0.5531355738639832,
"adv/mean_abs_step_conf": 0.7585821151733398,
"adv/ratio_final_to_reasoning": 1.0267867447191967,
"adv/ratio_step_to_reasoning": 1.371421674932584,
"adv/std_final_conf": 0.7788091897964478,
"adv/std_reasoning": 0.7755100727081299,
"adv/std_step_conf": 0.9326937794685364,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49162539103232533,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.4342570281124499,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9357429718875502,
"calib/gap": -0.0028480187695517634,
"calib/mean_conf": 0.9600401606425704,
"calib/mu_c": 0.9587591240875912,
"calib/mu_w": 0.961607142857143,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42204819277108446,
"calib/std_conf": 0.11564902585972572,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5245710784313725,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.06565829467987905,
"calib/step_q_w": 0.45891278375149347,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2756.0,
"completions/max_terminated_length": 2756.0,
"completions/mean_length": 512.97265625,
"completions/mean_terminated_length": 517.0117797851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.045611463487148285,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0846,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03405851125717163,
"mask/share_reasoning": 0.8291573524475098,
"mask/share_step_conf": 0.12897160649299622,
"num_tokens": 14870901.0,
"reward": 0.8375222086906433,
"reward_std": 0.21086472272872925,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5541878938674927,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8192939162254333,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.4292864501476288,
"adv/mean_abs_reasoning": 0.41687119007110596,
"adv/mean_abs_step_conf": 0.7617143392562866,
"adv/ratio_final_to_reasoning": 1.0297820055024793,
"adv/ratio_step_to_reasoning": 1.8272175132235944,
"adv/std_final_conf": 0.7025664448738098,
"adv/std_reasoning": 0.6816080808639526,
"adv/std_step_conf": 0.9308570027351379,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5537826595606395,
"calib/avg_num_step_conf": 6.74609375,
"calib/ece": 0.3613333333333332,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9686274509803922,
"calib/gap": 0.022671259586637182,
"calib/mean_conf": 0.9770196078431372,
"calib/mu_c": 0.9857324840764331,
"calib/mu_w": 0.9630612244897959,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3613333333333332,
"calib/std_conf": 0.07232104084473495,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5041708043694141,
"calib/step_q_c_n": 1007.0,
"calib/step_q_gap": 0.03707358214719175,
"calib/step_q_w": 0.46709722222222233,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2460.0,
"completions/max_terminated_length": 2460.0,
"completions/mean_length": 576.10546875,
"completions/mean_terminated_length": 576.10546875,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.0672,
"grad_norm": 0.039514731615781784,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0053,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03206784278154373,
"mask/share_reasoning": 0.8391179442405701,
"mask/share_step_conf": 0.12881425023078918,
"num_tokens": 15127024.0,
"reward": 0.9113430976867676,
"reward_std": 0.15366439521312714,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6358265280723572,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8649846315383911,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.5828859210014343,
"adv/mean_abs_reasoning": 0.5493597984313965,
"adv/mean_abs_step_conf": 0.744002103805542,
"adv/ratio_final_to_reasoning": 1.0610276228179891,
"adv/ratio_step_to_reasoning": 1.3543075156389557,
"adv/std_final_conf": 0.8123847842216492,
"adv/std_reasoning": 0.7929263710975647,
"adv/std_step_conf": 0.9302653670310974,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5185117569738572,
"calib/avg_num_step_conf": 6.1796875,
"calib/ece": 0.3153815261044177,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.005405286987001601,
"calib/mean_conf": 0.9860642570281126,
"calib/mu_c": 0.9878443113772454,
"calib/mu_w": 0.9824390243902438,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3153815261044177,
"calib/std_conf": 0.03338608106961565,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48400000000000004,
"calib/step_q_c_n": 1015.0,
"calib/step_q_gap": 0.022465608465608522,
"calib/step_q_w": 0.4615343915343915,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2668.0,
"completions/max_terminated_length": 2668.0,
"completions/mean_length": 505.39453125,
"completions/mean_terminated_length": 509.3740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.08964262902736664,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0616,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0351257249712944,
"mask/share_reasoning": 0.8196390867233276,
"mask/share_step_conf": 0.13742271065711975,
"num_tokens": 15360181.0,
"reward": 0.9109358787536621,
"reward_std": 0.21645523607730865,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6623206734657288,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8345509767532349,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.2017216831445694,
"adv/mean_abs_reasoning": 0.19740091264247894,
"adv/mean_abs_step_conf": 0.7495631575584412,
"adv/ratio_final_to_reasoning": 1.021888300536462,
"adv/ratio_step_to_reasoning": 3.797161560828274,
"adv/std_final_conf": 0.4967176914215088,
"adv/std_reasoning": 0.4958440661430359,
"adv/std_step_conf": 0.9305339455604553,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5045454545454545,
"calib/avg_num_step_conf": 5.8359375,
"calib/ece": 0.41956862745098045,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": 0.0041818181818185085,
"calib/mean_conf": 0.9881960784313726,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9858181818181816,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41956862745098045,
"calib/std_conf": 0.028749778930330372,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5030480656506449,
"calib/step_q_c_n": 853.0,
"calib/step_q_gap": 0.05794666159448253,
"calib/step_q_w": 0.44510140405616233,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1432.0,
"completions/max_terminated_length": 1432.0,
"completions/mean_length": 418.296875,
"completions/mean_terminated_length": 419.9372863769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.03971258923411369,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0452,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03916507214307785,
"mask/share_reasoning": 0.8115283250808716,
"mask/share_step_conf": 0.14540034532546997,
"num_tokens": 15572289.0,
"reward": 0.8780311346054077,
"reward_std": 0.0837891697883606,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5776315927505493,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8659305572509766,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.4690343737602234,
"adv/mean_abs_reasoning": 0.43525969982147217,
"adv/mean_abs_step_conf": 0.7657510042190552,
"adv/ratio_final_to_reasoning": 1.0775966025630317,
"adv/ratio_step_to_reasoning": 1.7592968164365748,
"adv/std_final_conf": 0.7395740747451782,
"adv/std_reasoning": 0.7205032706260681,
"adv/std_step_conf": 0.9319112300872803,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.555878084179971,
"calib/avg_num_step_conf": 6.92578125,
"calib/ece": 0.4014457831325302,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.963855421686747,
"calib/gap": 0.026591238949729368,
"calib/mean_conf": 0.9757429718875502,
"calib/mu_c": 0.9870629370629369,
"calib/mu_w": 0.9604716981132075,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4014457831325302,
"calib/std_conf": 0.07292752495723935,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45279886363636374,
"calib/step_q_c_n": 880.0,
"calib/step_q_gap": 0.05453458592079824,
"calib/step_q_w": 0.3982642777155655,
"calib/step_q_w_n": 893.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 577.3984375,
"completions/mean_terminated_length": 579.6627807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.0704,
"grad_norm": 0.045602548867464066,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.005,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03310517594218254,
"mask/share_reasoning": 0.8324896097183228,
"mask/share_step_conf": 0.13049902021884918,
"num_tokens": 15826455.0,
"reward": 0.8676662445068359,
"reward_std": 0.17554137110710144,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5855827927589417,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8434996008872986,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.34227991104125977,
"adv/mean_abs_reasoning": 0.23182302713394165,
"adv/mean_abs_step_conf": 0.7574501037597656,
"adv/ratio_final_to_reasoning": 1.4764707167916449,
"adv/ratio_step_to_reasoning": 3.2673635277919546,
"adv/std_final_conf": 0.621044933795929,
"adv/std_reasoning": 0.5227372050285339,
"adv/std_step_conf": 0.9281366467475891,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5851394211708736,
"calib/avg_num_step_conf": 6.390625,
"calib/ece": 0.3375686274509805,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9450980392156862,
"calib/gap": 0.05679067001453708,
"calib/mean_conf": 0.9689411764705883,
"calib/mu_c": 0.9898757763975156,
"calib/mu_w": 0.9330851063829785,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3375686274509805,
"calib/std_conf": 0.09115842958767131,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46348416289592764,
"calib/step_q_c_n": 884.0,
"calib/step_q_gap": 0.06419825864060846,
"calib/step_q_w": 0.3992859042553192,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2858.0,
"completions/max_terminated_length": 2858.0,
"completions/mean_length": 504.78125,
"completions/mean_terminated_length": 504.78125,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.04864136129617691,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0039,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03638178110122681,
"mask/share_reasoning": 0.8302202820777893,
"mask/share_step_conf": 0.13339796662330627,
"num_tokens": 16060687.0,
"reward": 0.9319067001342773,
"reward_std": 0.10852377116680145,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6688085794448853,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8700047135353088,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.4381498098373413,
"adv/mean_abs_reasoning": 0.43346601724624634,
"adv/mean_abs_step_conf": 0.7439643740653992,
"adv/ratio_final_to_reasoning": 1.01080544357514,
"adv/ratio_step_to_reasoning": 1.7163153383781014,
"adv/std_final_conf": 0.7203687429428101,
"adv/std_reasoning": 0.7204791903495789,
"adv/std_step_conf": 0.9302870631217957,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5455629139072847,
"calib/avg_num_step_conf": 6.4609375,
"calib/ece": 0.36521912350597613,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9561752988047809,
"calib/gap": 0.04423708609271493,
"calib/mean_conf": 0.9668127490039842,
"calib/mu_c": 0.9844370860927151,
"calib/mu_w": 0.9402000000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36521912350597613,
"calib/std_conf": 0.11096193308714032,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4352251184834124,
"calib/step_q_c_n": 844.0,
"calib/step_q_gap": 0.08008931601427655,
"calib/step_q_w": 0.35513580246913584,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 510.81640625,
"completions/mean_terminated_length": 510.81640625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.04457082226872444,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0289,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.036559879779815674,
"mask/share_reasoning": 0.8241531848907471,
"mask/share_step_conf": 0.13928695023059845,
"num_tokens": 16295544.0,
"reward": 0.8986823558807373,
"reward_std": 0.1547134965658188,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6234105229377747,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8591104745864868,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.5651110410690308,
"adv/mean_abs_reasoning": 0.531714677810669,
"adv/mean_abs_step_conf": 0.753547191619873,
"adv/ratio_final_to_reasoning": 1.0628088045187525,
"adv/ratio_step_to_reasoning": 1.417202163240251,
"adv/std_final_conf": 0.7929477691650391,
"adv/std_reasoning": 0.7928342819213867,
"adv/std_step_conf": 0.93269282579422,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5532371794871795,
"calib/avg_num_step_conf": 6.75,
"calib/ece": 0.49424000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.964,
"calib/gap": 0.025339743589743446,
"calib/mean_conf": 0.9742400000000002,
"calib/mu_c": 0.9874166666666667,
"calib/mu_w": 0.9620769230769233,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.49424000000000007,
"calib/std_conf": 0.08760606371707381,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4182626538987688,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.05224459973628126,
"calib/step_q_w": 0.36601805416248756,
"calib/step_q_w_n": 997.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2212.0,
"completions/max_terminated_length": 2212.0,
"completions/mean_length": 570.68359375,
"completions/mean_terminated_length": 582.0518188476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0736,
"grad_norm": 0.04093863442540169,
"learning_rate": 3.638888888888889e-06,
"loss": -0.1506,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031352706253528595,
"mask/share_reasoning": 0.8293837904930115,
"mask/share_step_conf": 0.11973226070404053,
"num_tokens": 16546135.0,
"reward": 0.8139256238937378,
"reward_std": 0.19919061660766602,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.4989679753780365,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8406020402908325,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.5300338268280029,
"adv/mean_abs_reasoning": 0.46458911895751953,
"adv/mean_abs_step_conf": 0.7436332702636719,
"adv/ratio_final_to_reasoning": 1.14086577838356,
"adv/ratio_step_to_reasoning": 1.6006256709849187,
"adv/std_final_conf": 0.7751262784004211,
"adv/std_reasoning": 0.7392660975456238,
"adv/std_step_conf": 0.9322122931480408,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.646482577251808,
"calib/avg_num_step_conf": 6.6171875,
"calib/ece": 0.3817408906882591,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.854251012145749,
"calib/gap": 0.15967521367521398,
"calib/mean_conf": 0.9080566801619432,
"calib/mu_c": 0.9836923076923079,
"calib/mu_w": 0.8240170940170939,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3817408906882591,
"calib/std_conf": 0.22664841243704695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.411068493150685,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.10200210310919117,
"calib/step_q_w": 0.30906639004149383,
"calib/step_q_w_n": 964.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2810.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 535.23046875,
"completions/mean_terminated_length": 543.7261962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.08670484274625778,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.1004,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03494434431195259,
"mask/share_reasoning": 0.8160851001739502,
"mask/share_step_conf": 0.1333456039428711,
"num_tokens": 16790146.0,
"reward": 0.8777591586112976,
"reward_std": 0.1863366961479187,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6109519004821777,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8492538928985596,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.5125067234039307,
"adv/mean_abs_reasoning": 0.44174620509147644,
"adv/mean_abs_step_conf": 0.7183400988578796,
"adv/ratio_final_to_reasoning": 1.160183647299927,
"adv/ratio_step_to_reasoning": 1.6261375662732096,
"adv/std_final_conf": 0.7567806243896484,
"adv/std_reasoning": 0.7014018893241882,
"adv/std_step_conf": 0.9308453798294067,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5626856803327391,
"calib/avg_num_step_conf": 7.6796875,
"calib/ece": 0.32253968253968246,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8452380952380952,
"calib/gap": 0.046179441473559124,
"calib/mean_conf": 0.9123809523809525,
"calib/mu_c": 0.9305228758169934,
"calib/mu_w": 0.8843434343434343,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.31388888888888883,
"calib/std_conf": 0.19549054526493564,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.38961165048543694,
"calib/step_q_c_n": 1030.0,
"calib/step_q_gap": 0.07593109492988137,
"calib/step_q_w": 0.3136805555555556,
"calib/step_q_w_n": 936.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 525.77734375,
"completions/mean_terminated_length": 529.9172973632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.04918983578681946,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0623,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03495267033576965,
"mask/share_reasoning": 0.8089569211006165,
"mask/share_step_conf": 0.14827793836593628,
"num_tokens": 17029153.0,
"reward": 0.9064410924911499,
"reward_std": 0.15928587317466736,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6419327855110168,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8545430302619934,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.49696022272109985,
"adv/mean_abs_reasoning": 0.3554950952529907,
"adv/mean_abs_step_conf": 0.7681937217712402,
"adv/ratio_final_to_reasoning": 1.3979383382699961,
"adv/ratio_step_to_reasoning": 2.1609122939503553,
"adv/std_final_conf": 0.7404859066009521,
"adv/std_reasoning": 0.6185715794563293,
"adv/std_step_conf": 0.9309424757957458,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6303114400246685,
"calib/avg_num_step_conf": 5.97265625,
"calib/ece": 0.3733203124999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.83984375,
"calib/gap": 0.10738390379278451,
"calib/mean_conf": 0.9241015625000001,
"calib/mu_c": 0.9723404255319148,
"calib/mu_w": 0.8649565217391303,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3733203124999999,
"calib/std_conf": 0.1616177173457743,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4070370370370371,
"calib/step_q_c_n": 810.0,
"calib/step_q_gap": 0.06840004120949883,
"calib/step_q_w": 0.33863699582753826,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1103.0,
"completions/max_terminated_length": 1103.0,
"completions/mean_length": 456.06640625,
"completions/mean_terminated_length": 457.85491943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0768,
"grad_norm": 0.0759405568242073,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0303,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03548864275217056,
"mask/share_reasoning": 0.8234069347381592,
"mask/share_step_conf": 0.13719822466373444,
"num_tokens": 17250314.0,
"reward": 0.9098681807518005,
"reward_std": 0.13719965517520905,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6402285099029541,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8693515062332153,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.5311991572380066,
"adv/mean_abs_reasoning": 0.4925178289413452,
"adv/mean_abs_step_conf": 0.7761179208755493,
"adv/ratio_final_to_reasoning": 1.0785379249717841,
"adv/ratio_step_to_reasoning": 1.575816905032241,
"adv/std_final_conf": 0.7763379812240601,
"adv/std_reasoning": 0.7574392557144165,
"adv/std_step_conf": 0.9311206936836243,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6871885382059801,
"calib/avg_num_step_conf": 5.421875,
"calib/ece": 0.23546875000000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.78515625,
"calib/gap": 0.185753045404208,
"calib/mean_conf": 0.8849218750000001,
"calib/mu_c": 0.9458720930232558,
"calib/mu_w": 0.7601190476190478,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22425781250000004,
"calib/std_conf": 0.21694611980048037,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4082613510520487,
"calib/step_q_c_n": 903.0,
"calib/step_q_gap": 0.05640568094895598,
"calib/step_q_w": 0.3518556701030927,
"calib/step_q_w_n": 485.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1170.0,
"completions/max_terminated_length": 1170.0,
"completions/mean_length": 430.1015625,
"completions/mean_terminated_length": 431.78826904296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.08444429188966751,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.057,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03705844283103943,
"mask/share_reasoning": 0.8286069631576538,
"mask/share_step_conf": 0.13042829930782318,
"num_tokens": 17467452.0,
"reward": 0.9845550060272217,
"reward_std": 0.14298292994499207,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7689882516860962,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8657466173171997,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.5872247219085693,
"adv/mean_abs_reasoning": 0.41632869839668274,
"adv/mean_abs_step_conf": 0.775867223739624,
"adv/ratio_final_to_reasoning": 1.4104834093110126,
"adv/ratio_step_to_reasoning": 1.8635929416529649,
"adv/std_final_conf": 0.8109135031700134,
"adv/std_reasoning": 0.6815744638442993,
"adv/std_step_conf": 0.931576669216156,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6481874999999999,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.2945059288537548,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6245059288537549,
"calib/gap": 0.15327250000000014,
"calib/mean_conf": 0.7945849802371543,
"calib/mu_c": 0.8703125,
"calib/mu_w": 0.7170399999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29158102766798405,
"calib/std_conf": 0.276334572824399,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.37592087312414735,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.07019826240153665,
"calib/step_q_w": 0.3057226107226107,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1665.0,
"completions/max_terminated_length": 1665.0,
"completions/mean_length": 501.54296875,
"completions/mean_terminated_length": 503.50982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.0911051332950592,
"learning_rate": 3.5e-06,
"loss": -0.0789,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03497898578643799,
"mask/share_reasoning": 0.8209805488586426,
"mask/share_step_conf": 0.14013421535491943,
"num_tokens": 17699775.0,
"reward": 0.9126991033554077,
"reward_std": 0.14510872960090637,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6552554368972778,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.873267650604248,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.41595423221588135,
"adv/mean_abs_reasoning": 0.33379027247428894,
"adv/mean_abs_step_conf": 0.7602725625038147,
"adv/ratio_final_to_reasoning": 1.2461544464209073,
"adv/ratio_step_to_reasoning": 2.2776953829964492,
"adv/std_final_conf": 0.6846178770065308,
"adv/std_reasoning": 0.6185281872749329,
"adv/std_step_conf": 0.9292907118797302,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8232046332046332,
"calib/avg_num_step_conf": 6.3046875,
"calib/ece": 0.10133333333333334,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6705882352941176,
"calib/gap": 0.3401737451737451,
"calib/mean_conf": 0.8240784313725492,
"calib/mu_c": 0.9174594594594595,
"calib/mu_w": 0.5772857142857144,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0999607843137255,
"calib/std_conf": 0.25405386514977374,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39514535714285715,
"calib/step_q_c_n": 1120.0,
"calib/step_q_gap": 0.07751377819548871,
"calib/step_q_w": 0.31763157894736843,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1618.0,
"completions/max_terminated_length": 1618.0,
"completions/mean_length": 433.2734375,
"completions/mean_terminated_length": 434.9725646972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.08,
"grad_norm": 0.07387439161539078,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0055,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03749409317970276,
"mask/share_reasoning": 0.8042377829551697,
"mask/share_step_conf": 0.15436190366744995,
"num_tokens": 17915445.0,
"reward": 1.0359642505645752,
"reward_std": 0.08390337228775024,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.8587093353271484,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8694689273834229,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.43272048234939575,
"adv/mean_abs_reasoning": 0.2551872134208679,
"adv/mean_abs_step_conf": 0.7533714771270752,
"adv/ratio_final_to_reasoning": 1.695698136864447,
"adv/ratio_step_to_reasoning": 2.9522305096242265,
"adv/std_final_conf": 0.7211237549781799,
"adv/std_reasoning": 0.5483068227767944,
"adv/std_step_conf": 0.9313459396362305,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7246901597730327,
"calib/avg_num_step_conf": 5.453125,
"calib/ece": 0.14231372549019605,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5294117647058824,
"calib/gap": 0.2826183365686128,
"calib/mean_conf": 0.7153333333333334,
"calib/mu_c": 0.7973480662983425,
"calib/mu_w": 0.5147297297297297,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07392156862745095,
"calib/std_conf": 0.31995800378019384,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4056301652892562,
"calib/step_q_c_n": 968.0,
"calib/step_q_gap": 0.08953203444813468,
"calib/step_q_w": 0.31609813084112154,
"calib/step_q_w_n": 428.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 446.8671875,
"completions/mean_terminated_length": 446.8671875,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.08966308832168579,
"learning_rate": 3.444444444444445e-06,
"loss": -0.031,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0387980118393898,
"mask/share_reasoning": 0.823246955871582,
"mask/share_step_conf": 0.13795502483844757,
"num_tokens": 18132899.0,
"reward": 1.003382682800293,
"reward_std": 0.09004627168178558,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7974995970726013,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8702032566070557,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.6654222011566162,
"adv/mean_abs_reasoning": 0.4536302089691162,
"adv/mean_abs_step_conf": 0.7507677674293518,
"adv/ratio_final_to_reasoning": 1.4668824694651654,
"adv/ratio_step_to_reasoning": 1.6550215408614137,
"adv/std_final_conf": 0.8907448053359985,
"adv/std_reasoning": 0.7392147779464722,
"adv/std_step_conf": 0.9320644736289978,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6540658602150538,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.21644268774703546,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.30039525691699603,
"calib/gap": 0.174858870967742,
"calib/mean_conf": 0.555098814229249,
"calib/mu_c": 0.619375,
"calib/mu_w": 0.444516129032258,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.06956521739130427,
"calib/std_conf": 0.318255636736884,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.33561269146608314,
"calib/step_q_c_n": 914.0,
"calib/step_q_gap": 0.062359850556992225,
"calib/step_q_w": 0.2732528409090909,
"calib/step_q_w_n": 704.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2531.0,
"completions/max_terminated_length": 2531.0,
"completions/mean_length": 461.9609375,
"completions/mean_terminated_length": 463.7725830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.15110744535923004,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0339,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03926243633031845,
"mask/share_reasoning": 0.8083683848381042,
"mask/share_step_conf": 0.1484629511833191,
"num_tokens": 18355825.0,
"reward": 0.9423686265945435,
"reward_std": 0.1237025111913681,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7302652597427368,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8341595530509949,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.651028573513031,
"adv/mean_abs_reasoning": 0.4202715754508972,
"adv/mean_abs_step_conf": 0.7488130331039429,
"adv/ratio_final_to_reasoning": 1.5490663931163113,
"adv/ratio_step_to_reasoning": 1.781736088862453,
"adv/std_final_conf": 0.8752011060714722,
"adv/std_reasoning": 0.6816163063049316,
"adv/std_step_conf": 0.9324920773506165,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7661667315932996,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.10632411067193673,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3241106719367589,
"calib/gap": 0.2798311907544474,
"calib/mean_conf": 0.6242687747035573,
"calib/mu_c": 0.7370860927152317,
"calib/mu_w": 0.45725490196078433,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06687747035573119,
"calib/std_conf": 0.2997402678243543,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34968652037617554,
"calib/step_q_c_n": 957.0,
"calib/step_q_gap": 0.07189916405433644,
"calib/step_q_w": 0.2777873563218391,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 486.23046875,
"completions/mean_terminated_length": 490.0590515136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0832,
"grad_norm": 0.09350687265396118,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0862,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034544557332992554,
"mask/share_reasoning": 0.8210122585296631,
"mask/share_step_conf": 0.13663063943386078,
"num_tokens": 18588324.0,
"reward": 0.9894336462020874,
"reward_std": 0.11710391193628311,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7940328121185303,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8692094683647156,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.5458283424377441,
"adv/mean_abs_reasoning": 0.4055037498474121,
"adv/mean_abs_step_conf": 0.7363581657409668,
"adv/ratio_final_to_reasoning": 1.3460500492119618,
"adv/ratio_step_to_reasoning": 1.8159096334326197,
"adv/std_final_conf": 0.8096640110015869,
"adv/std_reasoning": 0.681533694267273,
"adv/std_step_conf": 0.9306913614273071,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6668287622382156,
"calib/avg_num_step_conf": 6.5078125,
"calib/ece": 0.17761718750000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.47265625,
"calib/gap": 0.167319587628866,
"calib/mean_conf": 0.7166015625000001,
"calib/mu_c": 0.78,
"calib/mu_w": 0.612680412371134,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13656250000000003,
"calib/std_conf": 0.28650059554485846,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3289990186457311,
"calib/step_q_c_n": 1019.0,
"calib/step_q_gap": 0.07335759669828129,
"calib/step_q_w": 0.2556414219474498,
"calib/step_q_w_n": 647.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1188.0,
"completions/max_terminated_length": 1188.0,
"completions/mean_length": 488.79296875,
"completions/mean_terminated_length": 490.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.09503999352455139,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0556,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03435365483164787,
"mask/share_reasoning": 0.8235607147216797,
"mask/share_step_conf": 0.13817936182022095,
"num_tokens": 18819831.0,
"reward": 0.9725010991096497,
"reward_std": 0.09718985855579376,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7486811876296997,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8728834390640259,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6295976638793945,
"adv/mean_abs_reasoning": 0.4713048040866852,
"adv/mean_abs_step_conf": 0.7639672160148621,
"adv/ratio_final_to_reasoning": 1.335860908737088,
"adv/ratio_step_to_reasoning": 1.6209620809940835,
"adv/std_final_conf": 0.8431612253189087,
"adv/std_reasoning": 0.7205964922904968,
"adv/std_step_conf": 0.9322971701622009,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7113510220657913,
"calib/avg_num_step_conf": 6.55078125,
"calib/ece": 0.19917647058823534,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6901960784313725,
"calib/gap": 0.2162758900771624,
"calib/mean_conf": 0.8442745098039216,
"calib/mu_c": 0.9197590361445782,
"calib/mu_w": 0.7034831460674158,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19623529411764712,
"calib/std_conf": 0.24391288049592322,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3613254203758655,
"calib/step_q_c_n": 1011.0,
"calib/step_q_gap": 0.1055146095650547,
"calib/step_q_w": 0.2558108108108108,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2055.0,
"completions/max_terminated_length": 2055.0,
"completions/mean_length": 447.6796875,
"completions/mean_terminated_length": 447.6796875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.0750783234834671,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.038063328713178635,
"mask/share_reasoning": 0.8109001517295837,
"mask/share_step_conf": 0.15103650093078613,
"num_tokens": 19036597.0,
"reward": 0.9909579157829285,
"reward_std": 0.1509701907634735,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7711926102638245,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8818169832229614,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.4786444902420044,
"adv/mean_abs_reasoning": 0.3697603940963745,
"adv/mean_abs_step_conf": 0.746623158454895,
"adv/ratio_final_to_reasoning": 1.2944720361728366,
"adv/ratio_step_to_reasoning": 2.019208034109502,
"adv/std_final_conf": 0.7590821385383606,
"adv/std_reasoning": 0.6612280607223511,
"adv/std_step_conf": 0.9312000274658203,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6862087116725885,
"calib/avg_num_step_conf": 6.4765625,
"calib/ece": 0.2549003984063745,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7131474103585658,
"calib/gap": 0.24628372154230815,
"calib/mean_conf": 0.8377689243027889,
"calib/mu_c": 0.9378523489932883,
"calib/mu_w": 0.6915686274509801,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24952191235059762,
"calib/std_conf": 0.2693621697204938,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3830434782608696,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.15963198939803255,
"calib/step_q_w": 0.22341148886283704,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 494.91796875,
"completions/mean_terminated_length": 496.8588562011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.0864,
"grad_norm": 0.08149909973144531,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.04,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03588252514600754,
"mask/share_reasoning": 0.8174002170562744,
"mask/share_step_conf": 0.14281101524829865,
"num_tokens": 19269544.0,
"reward": 0.943608283996582,
"reward_std": 0.14294570684432983,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7283687591552734,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8471290469169617,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6110142469406128,
"adv/mean_abs_reasoning": 0.5129064321517944,
"adv/mean_abs_step_conf": 0.75091153383255,
"adv/ratio_final_to_reasoning": 1.1912781915742936,
"adv/ratio_step_to_reasoning": 1.4640322030711406,
"adv/std_final_conf": 0.8275887966156006,
"adv/std_reasoning": 0.7753466367721558,
"adv/std_step_conf": 0.9330487847328186,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6737028681920723,
"calib/avg_num_step_conf": 6.59375,
"calib/ece": 0.28015873015873016,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7103174603174603,
"calib/gap": 0.17536577505639717,
"calib/mean_conf": 0.8317460317460318,
"calib/mu_c": 0.9062068965517242,
"calib/mu_w": 0.730841121495327,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26825396825396824,
"calib/std_conf": 0.2665091182366948,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.383,
"calib/step_q_c_n": 790.0,
"calib/step_q_gap": 0.14293318485523387,
"calib/step_q_w": 0.24006681514476613,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2664.0,
"completions/max_terminated_length": 2664.0,
"completions/mean_length": 440.87890625,
"completions/mean_terminated_length": 446.10675048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.09818227589130402,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0946,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037333473563194275,
"mask/share_reasoning": 0.805363118648529,
"mask/share_step_conf": 0.14558462798595428,
"num_tokens": 19487961.0,
"reward": 0.9272757768630981,
"reward_std": 0.1902877688407898,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6936218738555908,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8507733345031738,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.5640679597854614,
"adv/mean_abs_reasoning": 0.3593941926956177,
"adv/mean_abs_step_conf": 0.7403038740158081,
"adv/ratio_final_to_reasoning": 1.569496589676919,
"adv/ratio_step_to_reasoning": 2.059865988549222,
"adv/std_final_conf": 0.7934759259223938,
"adv/std_reasoning": 0.6402297019958496,
"adv/std_step_conf": 0.9326952695846558,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6180901432699993,
"calib/avg_num_step_conf": 6.44921875,
"calib/ece": 0.3313281249999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.73828125,
"calib/gap": 0.1460898973129191,
"calib/mean_conf": 0.8453906250000001,
"calib/mu_c": 0.912158273381295,
"calib/mu_w": 0.7660683760683759,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3168749999999999,
"calib/std_conf": 0.27356558384071156,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3712214765100671,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.10336275686326801,
"calib/step_q_w": 0.2678587196467991,
"calib/step_q_w_n": 906.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1652.0,
"completions/max_terminated_length": 1652.0,
"completions/mean_length": 507.12890625,
"completions/mean_terminated_length": 509.11767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.09128120541572571,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0273,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0362110435962677,
"mask/share_reasoning": 0.8244807124137878,
"mask/share_step_conf": 0.13540202379226685,
"num_tokens": 19725050.0,
"reward": 0.9137445688247681,
"reward_std": 0.13943439722061157,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6542484164237976,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8654282093048096,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.558834969997406,
"adv/mean_abs_reasoning": 0.36879509687423706,
"adv/mean_abs_step_conf": 0.7573169469833374,
"adv/ratio_final_to_reasoning": 1.5152993484291752,
"adv/ratio_step_to_reasoning": 2.0534897383453834,
"adv/std_final_conf": 0.7958536744117737,
"adv/std_reasoning": 0.6402270197868347,
"adv/std_step_conf": 0.9319294095039368,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6387340517775301,
"calib/avg_num_step_conf": 5.2890625,
"calib/ece": 0.3536078431372548,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.796078431372549,
"calib/gap": 0.1412244518766257,
"calib/mean_conf": 0.8860000000000001,
"calib/mu_c": 0.9507971014492754,
"calib/mu_w": 0.8095726495726497,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3492156862745097,
"calib/std_conf": 0.22265175481358468,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41620886981402005,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.11069741943234063,
"calib/step_q_w": 0.30551145038167943,
"calib/step_q_w_n": 655.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2416.0,
"completions/max_terminated_length": 2416.0,
"completions/mean_length": 417.390625,
"completions/mean_terminated_length": 417.390625,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.0896,
"grad_norm": 0.07120621204376221,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0132,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.040930844843387604,
"mask/share_reasoning": 0.8206195831298828,
"mask/share_step_conf": 0.13844957947731018,
"num_tokens": 19937822.0,
"reward": 0.9140485525131226,
"reward_std": 0.1578373908996582,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.650799572467804,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8702661991119385,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.5749181509017944,
"adv/mean_abs_reasoning": 0.47961947321891785,
"adv/mean_abs_step_conf": 0.7351856827735901,
"adv/ratio_final_to_reasoning": 1.1986964312422286,
"adv/ratio_step_to_reasoning": 1.5328520292127952,
"adv/std_final_conf": 0.8260165452957153,
"adv/std_reasoning": 0.7574634552001953,
"adv/std_step_conf": 0.9331307411193848,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7308958306441203,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.31855421686746993,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7108433734939759,
"calib/gap": 0.2720536982057571,
"calib/mean_conf": 0.8285943775100402,
"calib/mu_c": 0.9618897637795275,
"calib/mu_w": 0.6898360655737704,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31855421686746993,
"calib/std_conf": 0.2787479755334017,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.39964529331514326,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.14666703244557805,
"calib/step_q_w": 0.2529782608695652,
"calib/step_q_w_n": 920.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2173.0,
"completions/max_terminated_length": 2173.0,
"completions/mean_length": 486.53125,
"completions/mean_terminated_length": 492.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.08083637058734894,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0126,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03679840639233589,
"mask/share_reasoning": 0.8089617490768433,
"mask/share_step_conf": 0.14252111315727234,
"num_tokens": 20170198.0,
"reward": 0.9124147891998291,
"reward_std": 0.1984560340642929,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6802054643630981,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8524364829063416,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.5518361330032349,
"adv/mean_abs_reasoning": 0.40247124433517456,
"adv/mean_abs_step_conf": 0.7445517778396606,
"adv/ratio_final_to_reasoning": 1.3711194048528608,
"adv/ratio_step_to_reasoning": 1.8499502469289566,
"adv/std_final_conf": 0.7940528392791748,
"adv/std_reasoning": 0.681560218334198,
"adv/std_step_conf": 0.9328047633171082,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.68415503875969,
"calib/avg_num_step_conf": 6.01171875,
"calib/ece": 0.3247244094488189,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7007874015748031,
"calib/gap": 0.1996527131782947,
"calib/mean_conf": 0.8325984251968505,
"calib/mu_c": 0.9308527131782947,
"calib/mu_w": 0.7312,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3247244094488189,
"calib/std_conf": 0.2575924651610683,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.38953191489361705,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.10978371345476812,
"calib/step_q_w": 0.27974820143884893,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 453.5703125,
"completions/mean_terminated_length": 455.34906005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.0909588634967804,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0044,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04099898040294647,
"mask/share_reasoning": 0.8147042989730835,
"mask/share_step_conf": 0.14039045572280884,
"num_tokens": 20391824.0,
"reward": 0.9142040014266968,
"reward_std": 0.1749543696641922,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6614238023757935,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8701090812683105,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.4856010675430298,
"adv/mean_abs_reasoning": 0.42319798469543457,
"adv/mean_abs_step_conf": 0.7525032758712769,
"adv/ratio_final_to_reasoning": 1.1474560019289912,
"adv/ratio_step_to_reasoning": 1.7781353009344678,
"adv/std_final_conf": 0.7397770881652832,
"adv/std_reasoning": 0.6816813945770264,
"adv/std_step_conf": 0.9327853918075562,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5689840392879066,
"calib/avg_num_step_conf": 5.2734375,
"calib/ece": 0.24723320158102774,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9288537549407114,
"calib/gap": 0.060748925721301306,
"calib/mean_conf": 0.9579051383399211,
"calib/mu_c": 0.9751933701657458,
"calib/mu_w": 0.9144444444444445,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24486166007905144,
"calib/std_conf": 0.1267812177227895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4237377049180328,
"calib/step_q_c_n": 915.0,
"calib/step_q_gap": 0.08746184284906727,
"calib/step_q_w": 0.33627586206896554,
"calib/step_q_w_n": 435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2326.0,
"completions/max_terminated_length": 2326.0,
"completions/mean_length": 413.2265625,
"completions/mean_terminated_length": 414.8470764160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.0928,
"grad_norm": 0.07672244310379028,
"learning_rate": 3.138888888888889e-06,
"loss": 0.1069,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04202680289745331,
"mask/share_reasoning": 0.8173971176147461,
"mask/share_step_conf": 0.13666987419128418,
"num_tokens": 20603106.0,
"reward": 0.9694166779518127,
"reward_std": 0.15908363461494446,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7375199198722839,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8622509837150574,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.5365962982177734,
"adv/mean_abs_reasoning": 0.47331148386001587,
"adv/mean_abs_step_conf": 0.7489129304885864,
"adv/ratio_final_to_reasoning": 1.1337064840296045,
"adv/ratio_step_to_reasoning": 1.5822834560888892,
"adv/std_final_conf": 0.7939905524253845,
"adv/std_reasoning": 0.7574472427368164,
"adv/std_step_conf": 0.9319527745246887,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7550013702384215,
"calib/avg_num_step_conf": 5.91796875,
"calib/ece": 0.20233201581027666,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7391304347826086,
"calib/gap": 0.300604275143875,
"calib/mean_conf": 0.8419367588932808,
"calib/mu_c": 0.9476829268292682,
"calib/mu_w": 0.6470786516853932,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19802371541501973,
"calib/std_conf": 0.27218302802277267,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37860119047619045,
"calib/step_q_c_n": 1008.0,
"calib/step_q_gap": 0.09512979008171313,
"calib/step_q_w": 0.2834714003944773,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 489.859375,
"completions/mean_terminated_length": 489.859375,
"completions/min_length": 92.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.10223963111639023,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.1124,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03621866554021835,
"mask/share_reasoning": 0.8318451642990112,
"mask/share_step_conf": 0.13193616271018982,
"num_tokens": 20838358.0,
"reward": 0.9859171509742737,
"reward_std": 0.17374292016029358,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7881089448928833,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8579440116882324,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.5738560557365417,
"adv/mean_abs_reasoning": 0.3232640326023102,
"adv/mean_abs_step_conf": 0.7356339693069458,
"adv/ratio_final_to_reasoning": 1.7751930244665293,
"adv/ratio_step_to_reasoning": 2.2756443498678567,
"adv/std_final_conf": 0.7938140630722046,
"adv/std_reasoning": 0.6185097694396973,
"adv/std_step_conf": 0.9329736828804016,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.715201870221773,
"calib/avg_num_step_conf": 6.11328125,
"calib/ece": 0.24146825396825383,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.23565236620964214,
"calib/mean_conf": 0.7692460317460318,
"calib/mu_c": 0.8805263157894739,
"calib/mu_w": 0.6448739495798318,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24146825396825383,
"calib/std_conf": 0.28588456951613805,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3869816272965879,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.11916095481838118,
"calib/step_q_w": 0.26782067247820673,
"calib/step_q_w_n": 803.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1737.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 465.7109375,
"completions/mean_terminated_length": 473.10321044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.0948285236954689,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.1544,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.039537109434604645,
"mask/share_reasoning": 0.8073158264160156,
"mask/share_step_conf": 0.13752208650112152,
"num_tokens": 21066468.0,
"reward": 0.9474252462387085,
"reward_std": 0.14904716610908508,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.716819167137146,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8772500157356262,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.5179926156997681,
"adv/mean_abs_reasoning": 0.36165764927864075,
"adv/mean_abs_step_conf": 0.7658787965774536,
"adv/ratio_final_to_reasoning": 1.4322733577817355,
"adv/ratio_step_to_reasoning": 2.117690025650139,
"adv/std_final_conf": 0.7583214640617371,
"adv/std_reasoning": 0.6402938365936279,
"adv/std_step_conf": 0.9270572662353516,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6684695512820513,
"calib/avg_num_step_conf": 6.22265625,
"calib/ece": 0.23126984126984115,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6388888888888888,
"calib/gap": 0.20793269230769218,
"calib/mean_conf": 0.7830952380952382,
"calib/mu_c": 0.8623076923076922,
"calib/mu_w": 0.654375,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19765873015873003,
"calib/std_conf": 0.2968350626556248,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36621276595744684,
"calib/step_q_c_n": 940.0,
"calib/step_q_gap": 0.10798918249649736,
"calib/step_q_w": 0.2582235834609495,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 470.0390625,
"completions/mean_terminated_length": 471.88238525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.096,
"grad_norm": 0.11417663842439651,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0411,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.038944244384765625,
"mask/share_reasoning": 0.8182889223098755,
"mask/share_step_conf": 0.1388605684041977,
"num_tokens": 21290118.0,
"reward": 0.9442144632339478,
"reward_std": 0.1354803889989853,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.731640636920929,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8388195037841797,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.544395923614502,
"adv/mean_abs_reasoning": 0.48889443278312683,
"adv/mean_abs_step_conf": 0.7743960022926331,
"adv/ratio_final_to_reasoning": 1.1135244893573897,
"adv/ratio_step_to_reasoning": 1.583973860950375,
"adv/std_final_conf": 0.7938852310180664,
"adv/std_reasoning": 0.739333987236023,
"adv/std_step_conf": 0.9330266118049622,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6385478547854786,
"calib/avg_num_step_conf": 6.1171875,
"calib/ece": 0.23681274900398405,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.166839603960396,
"calib/mean_conf": 0.8036653386454184,
"calib/mu_c": 0.8708,
"calib/mu_w": 0.703960396039604,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22143426294820714,
"calib/std_conf": 0.27356309052173255,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38259675405742827,
"calib/step_q_c_n": 801.0,
"calib/step_q_gap": 0.0902045971946831,
"calib/step_q_w": 0.29239215686274517,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 476.4375,
"completions/mean_terminated_length": 478.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.07549665868282318,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0288,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034715428948402405,
"mask/share_reasoning": 0.8294082880020142,
"mask/share_step_conf": 0.13197004795074463,
"num_tokens": 21519798.0,
"reward": 0.9312552213668823,
"reward_std": 0.15992335975170135,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7083617448806763,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8400861024856567,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.6064884662628174,
"adv/mean_abs_reasoning": 0.5143711566925049,
"adv/mean_abs_step_conf": 0.7511153221130371,
"adv/ratio_final_to_reasoning": 1.1790872376333126,
"adv/ratio_step_to_reasoning": 1.4602594106225513,
"adv/std_final_conf": 0.8270331621170044,
"adv/std_reasoning": 0.7575141787528992,
"adv/std_step_conf": 0.9315164685249329,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7373693379790942,
"calib/avg_num_step_conf": 5.2265625,
"calib/ece": 0.14392156862745092,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5411764705882353,
"calib/gap": 0.2554147681586706,
"calib/mean_conf": 0.7441568627450981,
"calib/mu_c": 0.8353048780487805,
"calib/mu_w": 0.5798901098901099,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12247058823529408,
"calib/std_conf": 0.28358899924383246,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3873967889908257,
"calib/step_q_c_n": 872.0,
"calib/step_q_gap": 0.0632766173170059,
"calib/step_q_w": 0.3241201716738198,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1295.0,
"completions/max_terminated_length": 1295.0,
"completions/mean_length": 402.015625,
"completions/mean_terminated_length": 403.5921936035156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.0977119654417038,
"learning_rate": 3e-06,
"loss": -0.0141,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0412004217505455,
"mask/share_reasoning": 0.8171786665916443,
"mask/share_step_conf": 0.1377146691083908,
"num_tokens": 21729434.0,
"reward": 1.0000758171081543,
"reward_std": 0.12583737075328827,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7939882278442383,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8780382871627808,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.5656777620315552,
"adv/mean_abs_reasoning": 0.33399784564971924,
"adv/mean_abs_step_conf": 0.7464229464530945,
"adv/ratio_final_to_reasoning": 1.6936569184485417,
"adv/ratio_step_to_reasoning": 2.2348136557620397,
"adv/std_final_conf": 0.8103609085083008,
"adv/std_reasoning": 0.5963025689125061,
"adv/std_step_conf": 0.9319359064102173,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7271986417657047,
"calib/avg_num_step_conf": 6.34375,
"calib/ece": 0.14815999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.532,
"calib/gap": 0.22942614601018663,
"calib/mean_conf": 0.75656,
"calib/mu_c": 0.8437419354838711,
"calib/mu_w": 0.6143157894736845,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14235999999999996,
"calib/std_conf": 0.2669804607082698,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.411528384279476,
"calib/step_q_c_n": 916.0,
"calib/step_q_gap": 0.08777132213258337,
"calib/step_q_w": 0.32375706214689265,
"calib/step_q_w_n": 708.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2319.0,
"completions/max_terminated_length": 2319.0,
"completions/mean_length": 528.00390625,
"completions/mean_terminated_length": 528.00390625,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.0992,
"grad_norm": 0.10494104772806168,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0203,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.036547817289829254,
"mask/share_reasoning": 0.8225446939468384,
"mask/share_step_conf": 0.14090751111507416,
"num_tokens": 21970379.0,
"reward": 0.9656751751899719,
"reward_std": 0.13884666562080383,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7642366886138916,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8507072925567627,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.5616555213928223,
"adv/mean_abs_reasoning": 0.3449043333530426,
"adv/mean_abs_step_conf": 0.7426949739456177,
"adv/ratio_final_to_reasoning": 1.6284385758004207,
"adv/ratio_step_to_reasoning": 2.1533361634670976,
"adv/std_final_conf": 0.810353696346283,
"adv/std_reasoning": 0.6401544809341431,
"adv/std_step_conf": 0.9311507940292358,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7501669337606839,
"calib/avg_num_step_conf": 5.984375,
"calib/ece": 0.15476190476190482,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5158730158730159,
"calib/gap": 0.23957532051282038,
"calib/mean_conf": 0.7634126984126984,
"calib/mu_c": 0.854679487179487,
"calib/mu_w": 0.6151041666666667,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14956349206349212,
"calib/std_conf": 0.2580056683836915,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4527167630057804,
"calib/step_q_c_n": 865.0,
"calib/step_q_gap": 0.1465848289727969,
"calib/step_q_w": 0.3061319340329835,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2253.0,
"completions/max_terminated_length": 2253.0,
"completions/mean_length": 452.3125,
"completions/mean_terminated_length": 457.6759033203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.10385382175445557,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0691,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0380202978849411,
"mask/share_reasoning": 0.8159376978874207,
"mask/share_step_conf": 0.13432320952415466,
"num_tokens": 22194851.0,
"reward": 0.9824118614196777,
"reward_std": 0.12148786336183548,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7774211168289185,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8686525821685791,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5447875261306763,
"adv/mean_abs_reasoning": 0.4278485178947449,
"adv/mean_abs_step_conf": 0.7348989248275757,
"adv/ratio_final_to_reasoning": 1.273318717595043,
"adv/ratio_step_to_reasoning": 1.7176614948760167,
"adv/std_final_conf": 0.7937759160995483,
"adv/std_reasoning": 0.6816493272781372,
"adv/std_step_conf": 0.9309818148612976,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6389552635072349,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.23784313725490178,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7176470588235294,
"calib/gap": 0.13794504181600975,
"calib/mean_conf": 0.8537647058823531,
"calib/mu_c": 0.9040740740740739,
"calib/mu_w": 0.7661290322580642,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22815686274509786,
"calib/std_conf": 0.23772274081253417,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4700542299349241,
"calib/step_q_c_n": 922.0,
"calib/step_q_gap": 0.06599172993492408,
"calib/step_q_w": 0.4040625,
"calib/step_q_w_n": 512.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1534.0,
"completions/max_terminated_length": 1534.0,
"completions/mean_length": 458.53125,
"completions/mean_terminated_length": 460.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.09247266501188278,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0648,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.040411949157714844,
"mask/share_reasoning": 0.8161001801490784,
"mask/share_step_conf": 0.1395815908908844,
"num_tokens": 22418363.0,
"reward": 0.9670491218566895,
"reward_std": 0.1566634327173233,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7251417636871338,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8831750750541687,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.40070587396621704,
"adv/mean_abs_reasoning": 0.34994882345199585,
"adv/mean_abs_step_conf": 0.7387731075286865,
"adv/ratio_final_to_reasoning": 1.1450413520855394,
"adv/ratio_step_to_reasoning": 2.111088987930338,
"adv/std_final_conf": 0.6824913024902344,
"adv/std_reasoning": 0.6401877403259277,
"adv/std_step_conf": 0.9318387508392334,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7476726726726727,
"calib/avg_num_step_conf": 5.68359375,
"calib/ece": 0.18413385826771655,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7952755905511811,
"calib/gap": 0.2388753753753753,
"calib/mean_conf": 0.8927952755905514,
"calib/mu_c": 0.9623888888888888,
"calib/mu_w": 0.7235135135135136,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18413385826771655,
"calib/std_conf": 0.20217435519605295,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4986129032258065,
"calib/step_q_c_n": 930.0,
"calib/step_q_gap": 0.14602242703533025,
"calib/step_q_w": 0.35259047619047623,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1411.0,
"completions/max_terminated_length": 1411.0,
"completions/mean_length": 411.37109375,
"completions/mean_terminated_length": 414.6102294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1024,
"grad_norm": 0.08027921617031097,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0225,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04036235064268112,
"mask/share_reasoning": 0.8120318651199341,
"mask/share_step_conf": 0.1397933065891266,
"num_tokens": 22629490.0,
"reward": 0.9985183477401733,
"reward_std": 0.1332436501979828,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.8084640502929688,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8502914905548096,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.47524163126945496,
"adv/mean_abs_reasoning": 0.38075166940689087,
"adv/mean_abs_step_conf": 0.7631211280822754,
"adv/ratio_final_to_reasoning": 1.2481669010401297,
"adv/ratio_step_to_reasoning": 2.004248935457102,
"adv/std_final_conf": 0.7217053174972534,
"adv/std_reasoning": 0.6402749419212341,
"adv/std_step_conf": 0.9316134452819824,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6074507389162562,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.38542968750000006,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.859375,
"calib/gap": 0.08488916256157597,
"calib/mean_conf": 0.9283203125000001,
"calib/mu_c": 0.966785714285714,
"calib/mu_w": 0.8818965517241381,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38343750000000004,
"calib/std_conf": 0.16455405806269968,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5338696808510639,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.12172783687943267,
"calib/step_q_w": 0.4121418439716312,
"calib/step_q_w_n": 705.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1227.0,
"completions/max_terminated_length": 1227.0,
"completions/mean_length": 405.8125,
"completions/mean_terminated_length": 407.4039306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.09753809124231339,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0108,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04243121296167374,
"mask/share_reasoning": 0.8089368343353271,
"mask/share_step_conf": 0.14472568035125732,
"num_tokens": 22838450.0,
"reward": 0.8967458009719849,
"reward_std": 0.1497422307729721,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6216902732849121,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.862426221370697,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.43978482484817505,
"adv/mean_abs_reasoning": 0.45589399337768555,
"adv/mean_abs_step_conf": 0.7524597644805908,
"adv/ratio_final_to_reasoning": 0.9646646616022316,
"adv/ratio_step_to_reasoning": 1.6505147587176374,
"adv/std_final_conf": 0.721373438835144,
"adv/std_reasoning": 0.7206504940986633,
"adv/std_step_conf": 0.9322798848152161,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6187590187590187,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.24927710843373502,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8393574297188755,
"calib/gap": 0.14336147186147152,
"calib/mean_conf": 0.8998795180722894,
"calib/mu_c": 0.9482424242424241,
"calib/mu_w": 0.8048809523809526,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24325301204819286,
"calib/std_conf": 0.21657146242178518,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5628714107365793,
"calib/step_q_c_n": 801.0,
"calib/step_q_gap": 0.1687890779750012,
"calib/step_q_w": 0.3940823327615781,
"calib/step_q_w_n": 583.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2984.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 467.34765625,
"completions/mean_terminated_length": 471.0275573730469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.07917863130569458,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0165,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04089288413524628,
"mask/share_reasoning": 0.8173341155052185,
"mask/share_step_conf": 0.13396045565605164,
"num_tokens": 23064275.0,
"reward": 0.943149209022522,
"reward_std": 0.15866869688034058,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7172074317932129,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8456535339355469,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.5576080679893494,
"adv/mean_abs_reasoning": 0.4876534342765808,
"adv/mean_abs_step_conf": 0.7644209265708923,
"adv/ratio_final_to_reasoning": 1.1434515350364427,
"adv/ratio_step_to_reasoning": 1.5675495604883574,
"adv/std_final_conf": 0.7939621210098267,
"adv/std_reasoning": 0.7575070261955261,
"adv/std_step_conf": 0.9333102107048035,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6657533383723859,
"calib/avg_num_step_conf": 5.76171875,
"calib/ece": 0.3549603174603174,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7380952380952381,
"calib/gap": 0.16452380952380952,
"calib/mean_conf": 0.8549603174603175,
"calib/mu_c": 0.9372222222222221,
"calib/mu_w": 0.7726984126984126,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3549603174603174,
"calib/std_conf": 0.23729042217721738,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5305287009063445,
"calib/step_q_c_n": 662.0,
"calib/step_q_gap": 0.1325951215705511,
"calib/step_q_w": 0.39793357933579343,
"calib/step_q_w_n": 813.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2681.0,
"completions/max_terminated_length": 2681.0,
"completions/mean_length": 504.19140625,
"completions/mean_terminated_length": 510.16998291015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.1056,
"grad_norm": 0.07687898725271225,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0539,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03424970805644989,
"mask/share_reasoning": 0.8309434056282043,
"mask/share_step_conf": 0.12308812141418457,
"num_tokens": 23299148.0,
"reward": 0.8982889652252197,
"reward_std": 0.18456372618675232,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6398026943206787,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8614625930786133,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.47142088413238525,
"adv/mean_abs_reasoning": 0.3072330355644226,
"adv/mean_abs_step_conf": 0.7255579233169556,
"adv/ratio_final_to_reasoning": 1.53440818389315,
"adv/ratio_step_to_reasoning": 2.3615882386606692,
"adv/std_final_conf": 0.7401483058929443,
"adv/std_reasoning": 0.5961126685142517,
"adv/std_step_conf": 0.929786741733551,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8007024265644955,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.245098814229249,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6877470355731226,
"calib/gap": 0.3496711366538954,
"calib/mean_conf": 0.8114229249011858,
"calib/mu_c": 0.9606896551724139,
"calib/mu_w": 0.6110185185185185,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.241699604743083,
"calib/std_conf": 0.27266861372953083,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.530967365967366,
"calib/step_q_c_n": 858.0,
"calib/step_q_gap": 0.1171056458999124,
"calib/step_q_w": 0.41386172006745364,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 488.44921875,
"completions/mean_terminated_length": 492.2952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.08317389339208603,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0064,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037921108305454254,
"mask/share_reasoning": 0.8238252401351929,
"mask/share_step_conf": 0.13044117391109467,
"num_tokens": 23531599.0,
"reward": 0.9863134026527405,
"reward_std": 0.1421002447605133,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7859878540039062,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8757014274597168,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.5873388051986694,
"adv/mean_abs_reasoning": 0.4507104158401489,
"adv/mean_abs_step_conf": 0.7487509250640869,
"adv/ratio_final_to_reasoning": 1.3031400752162288,
"adv/ratio_step_to_reasoning": 1.6612682972244477,
"adv/std_final_conf": 0.8106685280799866,
"adv/std_reasoning": 0.7014285326004028,
"adv/std_step_conf": 0.9310752749443054,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7089002267573697,
"calib/avg_num_step_conf": 6.23046875,
"calib/ece": 0.2904365079365079,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5595238095238095,
"calib/gap": 0.21246031746031757,
"calib/mean_conf": 0.7738492063492064,
"calib/mu_c": 0.880079365079365,
"calib/mu_w": 0.6676190476190474,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.28214285714285703,
"calib/std_conf": 0.2642507097033709,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4832345679012346,
"calib/step_q_c_n": 810.0,
"calib/step_q_gap": 0.08561673350633014,
"calib/step_q_w": 0.39761783439490445,
"calib/step_q_w_n": 785.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2518.0,
"completions/max_terminated_length": 2518.0,
"completions/mean_length": 509.09765625,
"completions/mean_terminated_length": 511.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.10075970739126205,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0249,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035582467913627625,
"mask/share_reasoning": 0.8273746967315674,
"mask/share_step_conf": 0.13313661515712738,
"num_tokens": 23768920.0,
"reward": 0.9301528930664062,
"reward_std": 0.16000109910964966,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7002925872802734,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8647007346153259,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.4486492872238159,
"adv/mean_abs_reasoning": 0.3463757038116455,
"adv/mean_abs_step_conf": 0.7536967992782593,
"adv/ratio_final_to_reasoning": 1.2952677750971398,
"adv/ratio_step_to_reasoning": 2.1759516934481917,
"adv/std_final_conf": 0.740336537361145,
"adv/std_reasoning": 0.6402208805084229,
"adv/std_step_conf": 0.9320363402366638,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.728178953137846,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.20853754940711444,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7509881422924901,
"calib/gap": 0.25324403946286633,
"calib/mean_conf": 0.8557312252964429,
"calib/mu_c": 0.9448170731707316,
"calib/mu_w": 0.6915730337078653,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2080237154150196,
"calib/std_conf": 0.24362536621728567,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5029542645241039,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.0861121592609459,
"calib/step_q_w": 0.41684210526315796,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2359.0,
"completions/max_terminated_length": 2359.0,
"completions/mean_length": 416.1171875,
"completions/mean_terminated_length": 419.3937072753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.1088,
"grad_norm": 0.07325369119644165,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0532,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04374309629201889,
"mask/share_reasoning": 0.8045555353164673,
"mask/share_step_conf": 0.14388886094093323,
"num_tokens": 23982142.0,
"reward": 0.978011965751648,
"reward_std": 0.1434057354927063,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7757730484008789,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8552509546279907,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.4731312394142151,
"adv/mean_abs_reasoning": 0.3409392833709717,
"adv/mean_abs_step_conf": 0.7478026747703552,
"adv/ratio_final_to_reasoning": 1.387728732037625,
"adv/ratio_step_to_reasoning": 2.193360258684772,
"adv/std_final_conf": 0.739928662776947,
"adv/std_reasoning": 0.6185459494590759,
"adv/std_step_conf": 0.9310590624809265,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7711931183925406,
"calib/avg_num_step_conf": 5.1953125,
"calib/ece": 0.1794488188976377,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6535433070866141,
"calib/gap": 0.30495239345984615,
"calib/mean_conf": 0.7965354330708664,
"calib/mu_c": 0.9129936305732482,
"calib/mu_w": 0.6080412371134021,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17893700787401562,
"calib/std_conf": 0.277089422311816,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5100984009840099,
"calib/step_q_c_n": 813.0,
"calib/step_q_gap": 0.09677151510393256,
"calib/step_q_w": 0.4133268858800774,
"calib/step_q_w_n": 517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2422.0,
"completions/max_terminated_length": 2422.0,
"completions/mean_length": 509.796875,
"completions/mean_terminated_length": 509.796875,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.1110922247171402,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0246,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03668327257037163,
"mask/share_reasoning": 0.8423339128494263,
"mask/share_step_conf": 0.1209828332066536,
"num_tokens": 24217202.0,
"reward": 0.9978973269462585,
"reward_std": 0.12469913065433502,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7930593490600586,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8816415667533875,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6043910980224609,
"adv/mean_abs_reasoning": 0.42754340171813965,
"adv/mean_abs_step_conf": 0.739775538444519,
"adv/ratio_final_to_reasoning": 1.4136368274978293,
"adv/ratio_step_to_reasoning": 1.730293428624166,
"adv/std_final_conf": 0.8272894620895386,
"adv/std_reasoning": 0.7013717293739319,
"adv/std_step_conf": 0.9322694540023804,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7389520202020202,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.2618650793650794,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5158730158730159,
"calib/gap": 0.2524621212121214,
"calib/mean_conf": 0.7323412698412699,
"calib/mu_c": 0.8645833333333335,
"calib/mu_w": 0.6121212121212121,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2590079365079366,
"calib/std_conf": 0.2834293860350912,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4800479233226837,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.10273499029727956,
"calib/step_q_w": 0.37731293302540414,
"calib/step_q_w_n": 866.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2981.0,
"completions/max_terminated_length": 2981.0,
"completions/mean_length": 494.734375,
"completions/mean_terminated_length": 494.734375,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.14319024980068207,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0643,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03690874204039574,
"mask/share_reasoning": 0.8321300745010376,
"mask/share_step_conf": 0.13096114993095398,
"num_tokens": 24450534.0,
"reward": 0.9378859996795654,
"reward_std": 0.14923880994319916,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7191511392593384,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8659958839416504,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.6729190349578857,
"adv/mean_abs_reasoning": 0.45059072971343994,
"adv/mean_abs_step_conf": 0.7314640283584595,
"adv/ratio_final_to_reasoning": 1.4934151783056853,
"adv/ratio_step_to_reasoning": 1.6233446010388302,
"adv/std_final_conf": 0.8753982782363892,
"adv/std_reasoning": 0.7392308712005615,
"adv/std_step_conf": 0.9325223565101624,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6679465668559628,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.2029249011857707,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6086956521739131,
"calib/gap": 0.1823289881259681,
"calib/mean_conf": 0.7676679841897234,
"calib/mu_c": 0.8426174496644296,
"calib/mu_w": 0.6602884615384615,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19083003952569164,
"calib/std_conf": 0.2887681627548953,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4685986394557824,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.10343297680517993,
"calib/step_q_w": 0.36516566265060246,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2292.0,
"completions/max_terminated_length": 2292.0,
"completions/mean_length": 456.05078125,
"completions/mean_terminated_length": 457.8392333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.112,
"grad_norm": 0.07508841156959534,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0351,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0396403968334198,
"mask/share_reasoning": 0.8274632692337036,
"mask/share_step_conf": 0.12899011373519897,
"num_tokens": 24673043.0,
"reward": 0.9453893899917603,
"reward_std": 0.17942391335964203,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7189491987228394,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8585482239723206,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.5386701822280884,
"adv/mean_abs_reasoning": 0.37390226125717163,
"adv/mean_abs_step_conf": 0.7243057489395142,
"adv/ratio_final_to_reasoning": 1.4406711005622634,
"adv/ratio_step_to_reasoning": 1.937152630487392,
"adv/std_final_conf": 0.7936661839485168,
"adv/std_reasoning": 0.6814560294151306,
"adv/std_step_conf": 0.931763768196106,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7666728763040238,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.28248031496063,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6692913385826772,
"calib/gap": 0.3149528067560855,
"calib/mean_conf": 0.8012992125984253,
"calib/mu_c": 0.9525757575757576,
"calib/mu_w": 0.6376229508196721,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2820472440944882,
"calib/std_conf": 0.2834772224865189,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4911273792093705,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.10643517674699016,
"calib/step_q_w": 0.38469220246238034,
"calib/step_q_w_n": 731.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 463.83984375,
"completions/mean_terminated_length": 465.6588439941406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.11216479539871216,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0928,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03690160810947418,
"mask/share_reasoning": 0.8339164853096008,
"mask/share_step_conf": 0.1252756118774414,
"num_tokens": 24896370.0,
"reward": 0.9552474021911621,
"reward_std": 0.1446411907672882,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7421104907989502,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8668216466903687,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.5357671976089478,
"adv/mean_abs_reasoning": 0.4808008074760437,
"adv/mean_abs_step_conf": 0.7439740896224976,
"adv/ratio_final_to_reasoning": 1.1143225828206265,
"adv/ratio_step_to_reasoning": 1.5473644762120469,
"adv/std_final_conf": 0.7937635183334351,
"adv/std_reasoning": 0.7392998933792114,
"adv/std_step_conf": 0.932767391204834,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7604247104247105,
"calib/avg_num_step_conf": 5.9140625,
"calib/ece": 0.191383399209486,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6284584980237155,
"calib/gap": 0.31738738738738725,
"calib/mean_conf": 0.7723320158102768,
"calib/mu_c": 0.9040540540540539,
"calib/mu_w": 0.5866666666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18936758893280614,
"calib/std_conf": 0.29928719079143,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4520554854981085,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.08247157426371177,
"calib/step_q_w": 0.3695839112343967,
"calib/step_q_w_n": 721.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 460.2421875,
"completions/mean_terminated_length": 463.86614990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.059717439115047455,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.0756,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03775336593389511,
"mask/share_reasoning": 0.8178339004516602,
"mask/share_step_conf": 0.13660022616386414,
"num_tokens": 25118808.0,
"reward": 0.9761337637901306,
"reward_std": 0.1640060544013977,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7751832008361816,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8645843267440796,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.5327150225639343,
"adv/mean_abs_reasoning": 0.40658038854599,
"adv/mean_abs_step_conf": 0.751448392868042,
"adv/ratio_final_to_reasoning": 1.310232951640944,
"adv/ratio_step_to_reasoning": 1.8482160331327504,
"adv/std_final_conf": 0.7762723565101624,
"adv/std_reasoning": 0.6815800666809082,
"adv/std_step_conf": 0.9327512383460999,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6519259259259259,
"calib/avg_num_step_conf": 5.08203125,
"calib/ece": 0.18400000000000008,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7647058823529411,
"calib/gap": 0.18659999999999977,
"calib/mean_conf": 0.8554509803921571,
"calib/mu_c": 0.9103333333333332,
"calib/mu_w": 0.7237333333333335,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1667843137254903,
"calib/std_conf": 0.2545378213528175,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4945033112582781,
"calib/step_q_c_n": 906.0,
"calib/step_q_gap": 0.08121217201777181,
"calib/step_q_w": 0.4132911392405063,
"calib/step_q_w_n": 395.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1350.0,
"completions/max_terminated_length": 1350.0,
"completions/mean_length": 442.36328125,
"completions/mean_terminated_length": 444.0980529785156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.1152,
"grad_norm": 0.11303754150867462,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0309,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.040074534714221954,
"mask/share_reasoning": 0.8284124732017517,
"mask/share_step_conf": 0.12760674953460693,
"num_tokens": 25335285.0,
"reward": 0.9829171895980835,
"reward_std": 0.16700205206871033,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7740527391433716,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8527189493179321,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.5506957769393921,
"adv/mean_abs_reasoning": 0.2803175747394562,
"adv/mean_abs_step_conf": 0.7329084277153015,
"adv/ratio_final_to_reasoning": 1.9645424567162495,
"adv/ratio_step_to_reasoning": 2.6145646715034196,
"adv/std_final_conf": 0.809843122959137,
"adv/std_reasoning": 0.5960097312927246,
"adv/std_step_conf": 0.9302277565002441,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8472541507024266,
"calib/avg_num_step_conf": 6.08984375,
"calib/ece": 0.13517928286852596,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4820717131474104,
"calib/gap": 0.44353639846743287,
"calib/mean_conf": 0.6507968127490039,
"calib/mu_c": 0.8557777777777777,
"calib/mu_w": 0.4122413793103449,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12406374501992039,
"calib/std_conf": 0.3490421762289683,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44309278350515463,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.14832905425866677,
"calib/step_q_w": 0.29476372924648786,
"calib/step_q_w_n": 783.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2570.0,
"completions/max_terminated_length": 2570.0,
"completions/mean_length": 493.03125,
"completions/mean_terminated_length": 498.87750244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.09755361080169678,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0614,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03527119755744934,
"mask/share_reasoning": 0.8214081525802612,
"mask/share_step_conf": 0.1316019594669342,
"num_tokens": 25566101.0,
"reward": 1.0021953582763672,
"reward_std": 0.12378311902284622,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.8209879398345947,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.881840169429779,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.6958622336387634,
"adv/mean_abs_reasoning": 0.4584200382232666,
"adv/mean_abs_step_conf": 0.746253252029419,
"adv/ratio_final_to_reasoning": 1.5179577147974805,
"adv/ratio_step_to_reasoning": 1.6278809602689477,
"adv/std_final_conf": 0.8754823803901672,
"adv/std_reasoning": 0.720551609992981,
"adv/std_step_conf": 0.9323999881744385,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6815437286025521,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.16834645669291332,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4015748031496063,
"calib/gap": 0.204283846872082,
"calib/mean_conf": 0.6311811023622047,
"calib/mu_c": 0.7268888888888888,
"calib/mu_w": 0.5226050420168068,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.134015748031496,
"calib/std_conf": 0.3252282400819712,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4307389937106918,
"calib/step_q_c_n": 636.0,
"calib/step_q_gap": 0.07017351752021561,
"calib/step_q_w": 0.3605654761904762,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1284.0,
"completions/max_terminated_length": 1284.0,
"completions/mean_length": 410.23828125,
"completions/mean_terminated_length": 413.468505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.09466344863176346,
"learning_rate": 2.5e-06,
"loss": -0.1563,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04288126528263092,
"mask/share_reasoning": 0.8201444149017334,
"mask/share_step_conf": 0.1291617751121521,
"num_tokens": 25776042.0,
"reward": 0.9413056373596191,
"reward_std": 0.15106236934661865,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7286386489868164,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8516287803649902,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.6182411313056946,
"adv/mean_abs_reasoning": 0.4406476616859436,
"adv/mean_abs_step_conf": 0.743072509765625,
"adv/ratio_final_to_reasoning": 1.4030282810086137,
"adv/ratio_step_to_reasoning": 1.6863189672278898,
"adv/std_final_conf": 0.8267171382904053,
"adv/std_reasoning": 0.7014089226722717,
"adv/std_step_conf": 0.9295375347137451,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7810117810117809,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.09736220472440935,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.40551181102362205,
"calib/gap": 0.3560606060606062,
"calib/mean_conf": 0.5971259842519686,
"calib/mu_c": 0.7527272727272728,
"calib/mu_w": 0.3966666666666666,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06574803149606287,
"calib/std_conf": 0.3432521740559118,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4495,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.1129550561797753,
"calib/step_q_w": 0.3365449438202247,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2511.0,
"completions/max_terminated_length": 2511.0,
"completions/mean_length": 455.4453125,
"completions/mean_terminated_length": 457.2314147949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.1184,
"grad_norm": 0.13044364750385284,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.013,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.043011561036109924,
"mask/share_reasoning": 0.8237156867980957,
"mask/share_step_conf": 0.1293664574623108,
"num_tokens": 26000044.0,
"reward": 0.9977121353149414,
"reward_std": 0.10175234079360962,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.803855836391449,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.881412148475647,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.7288899421691895,
"adv/mean_abs_reasoning": 0.5192070603370667,
"adv/mean_abs_step_conf": 0.748733401298523,
"adv/ratio_final_to_reasoning": 1.4038521388672907,
"adv/ratio_step_to_reasoning": 1.4420709163940277,
"adv/std_final_conf": 0.9040296077728271,
"adv/std_reasoning": 0.7576315999031067,
"adv/std_step_conf": 0.93282151222229,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7564194473409803,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.14425702811244975,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.2971887550200803,
"calib/gap": 0.32838047445255475,
"calib/mean_conf": 0.4875502008032129,
"calib/mu_c": 0.6352554744525547,
"calib/mu_w": 0.30687499999999995,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.04080321285140557,
"calib/std_conf": 0.35301003787046775,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4341702127659575,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.12570989530564008,
"calib/step_q_w": 0.30846031746031743,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2571.0,
"completions/max_terminated_length": 2571.0,
"completions/mean_length": 480.9296875,
"completions/mean_terminated_length": 490.5099792480469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.41237401962280273,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.1422,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03407225385308266,
"mask/share_reasoning": 0.8303850889205933,
"mask/share_step_conf": 0.11601140350103378,
"num_tokens": 26231082.0,
"reward": 0.9550809860229492,
"reward_std": 0.1408146470785141,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7650078535079956,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.843591570854187,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.6568363904953003,
"adv/mean_abs_reasoning": 0.406067430973053,
"adv/mean_abs_step_conf": 0.7361575961112976,
"adv/ratio_final_to_reasoning": 1.6175549684478108,
"adv/ratio_step_to_reasoning": 1.8128949528093272,
"adv/std_final_conf": 0.8588363528251648,
"adv/std_reasoning": 0.7012441754341125,
"adv/std_step_conf": 0.9312564730644226,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7611304176775293,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.13626984126984118,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.40476190476190477,
"calib/gap": 0.3400052455576684,
"calib/mean_conf": 0.5753174603174603,
"calib/mu_c": 0.71158940397351,
"calib/mu_w": 0.3715841584158416,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05619047619047611,
"calib/std_conf": 0.35933511464338164,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44981969486823853,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.11916598462088873,
"calib/step_q_w": 0.3306537102473498,
"calib/step_q_w_n": 566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 410.3984375,
"completions/mean_terminated_length": 413.6299133300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.11442249268293381,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0525,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04052030295133591,
"mask/share_reasoning": 0.8211332559585571,
"mask/share_step_conf": 0.13053393363952637,
"num_tokens": 26441344.0,
"reward": 0.9872865676879883,
"reward_std": 0.13370364904403687,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7810617089271545,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8786677122116089,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.5734812021255493,
"adv/mean_abs_reasoning": 0.4173528850078583,
"adv/mean_abs_step_conf": 0.7532247304916382,
"adv/ratio_final_to_reasoning": 1.3740918602126144,
"adv/ratio_step_to_reasoning": 1.8047670389949642,
"adv/std_final_conf": 0.8099099397659302,
"adv/std_reasoning": 0.7013692855834961,
"adv/std_step_conf": 0.9314282536506653,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8578020429728778,
"calib/avg_num_step_conf": 5.08984375,
"calib/ece": 0.08992063492063503,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5396825396825397,
"calib/gap": 0.5081944346600915,
"calib/mean_conf": 0.6596031746031747,
"calib/mu_c": 0.8310179640718561,
"calib/mu_w": 0.3228235294117647,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.043412698412698514,
"calib/std_conf": 0.37626379802501275,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4893175853018373,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.1826817257824288,
"calib/step_q_w": 0.30663585951940847,
"calib/step_q_w_n": 541.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 437.5390625,
"completions/mean_terminated_length": 440.9842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.1216,
"grad_norm": 0.08511318266391754,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0256,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04344598576426506,
"mask/share_reasoning": 0.8190726041793823,
"mask/share_step_conf": 0.12966890633106232,
"num_tokens": 26658378.0,
"reward": 1.0297393798828125,
"reward_std": 0.13722848892211914,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.8486093878746033,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8835256099700928,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.582456648349762,
"adv/mean_abs_reasoning": 0.469838410615921,
"adv/mean_abs_step_conf": 0.763654351234436,
"adv/ratio_final_to_reasoning": 1.2396956808750637,
"adv/ratio_step_to_reasoning": 1.625355300843423,
"adv/std_final_conf": 0.8094815015792847,
"adv/std_reasoning": 0.72057044506073,
"adv/std_step_conf": 0.9315211176872253,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6543276908335995,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.2514453125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.5703125,
"calib/gap": 0.23725646758224195,
"calib/mean_conf": 0.6680078125000002,
"calib/mu_c": 0.7616129032258064,
"calib/mu_w": 0.5243564356435645,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1569921875,
"calib/std_conf": 0.38587807733008733,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.454953395472703,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.06627971126217669,
"calib/step_q_w": 0.38867368421052634,
"calib/step_q_w_n": 475.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1184.0,
"completions/max_terminated_length": 1184.0,
"completions/mean_length": 383.76171875,
"completions/mean_terminated_length": 385.2666931152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.11096165329217911,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0262,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.043351054191589355,
"mask/share_reasoning": 0.8242461085319519,
"mask/share_step_conf": 0.12849658727645874,
"num_tokens": 26861885.0,
"reward": 0.9501369595527649,
"reward_std": 0.13774867355823517,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7216605544090271,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8575196266174316,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.5877513289451599,
"adv/mean_abs_reasoning": 0.3486030697822571,
"adv/mean_abs_step_conf": 0.7445104718208313,
"adv/ratio_final_to_reasoning": 1.6860187987222217,
"adv/ratio_step_to_reasoning": 2.1356968321761025,
"adv/std_final_conf": 0.8238668441772461,
"adv/std_reasoning": 0.6402716636657715,
"adv/std_step_conf": 0.9296499490737915,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7527027027027027,
"calib/avg_num_step_conf": 6.0546875,
"calib/ece": 0.17768924302788838,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6175298804780877,
"calib/gap": 0.3859472329472329,
"calib/mean_conf": 0.705179282868526,
"calib/mu_c": 0.8758571428571429,
"calib/mu_w": 0.48990990990991,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1625498007968127,
"calib/std_conf": 0.37619327771809336,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45794158553546593,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.16778514750899182,
"calib/step_q_w": 0.2901564380264741,
"calib/step_q_w_n": 831.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2832.0,
"completions/max_terminated_length": 2832.0,
"completions/mean_length": 507.55078125,
"completions/mean_terminated_length": 509.54119873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.09604816138744354,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.079,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04052261635661125,
"mask/share_reasoning": 0.8265026807785034,
"mask/share_step_conf": 0.12906846404075623,
"num_tokens": 27096338.0,
"reward": 0.9689432382583618,
"reward_std": 0.15446476638317108,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7613726854324341,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8718262910842896,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.6196193695068359,
"adv/mean_abs_reasoning": 0.3499342203140259,
"adv/mean_abs_step_conf": 0.7461419105529785,
"adv/ratio_final_to_reasoning": 1.7706738396456299,
"adv/ratio_step_to_reasoning": 2.1322347665324117,
"adv/std_final_conf": 0.8284310698509216,
"adv/std_reasoning": 0.6401571035385132,
"adv/std_step_conf": 0.9326890110969543,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6507135826771654,
"calib/avg_num_step_conf": 5.20703125,
"calib/ece": 0.2643529411764704,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5607843137254902,
"calib/gap": 0.21378075787401607,
"calib/mean_conf": 0.673372549019608,
"calib/mu_c": 0.7798437500000001,
"calib/mu_w": 0.566062992125984,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21788235294117628,
"calib/std_conf": 0.3752546827069617,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45486356340288925,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.09852553523387514,
"calib/step_q_w": 0.3563380281690141,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2306.0,
"completions/max_terminated_length": 2306.0,
"completions/mean_length": 441.6171875,
"completions/mean_terminated_length": 441.6171875,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1248,
"grad_norm": 0.12227895110845566,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0571,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.039636991918087006,
"mask/share_reasoning": 0.8293566703796387,
"mask/share_step_conf": 0.13100633025169373,
"num_tokens": 27315992.0,
"reward": 0.9242790341377258,
"reward_std": 0.14576169848442078,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6840121150016785,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8653271794319153,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.550491452217102,
"adv/mean_abs_reasoning": 0.3719319701194763,
"adv/mean_abs_step_conf": 0.768734335899353,
"adv/ratio_final_to_reasoning": 1.4800864040815496,
"adv/ratio_step_to_reasoning": 2.0668681308907413,
"adv/std_final_conf": 0.7908481359481812,
"adv/std_reasoning": 0.6612310409545898,
"adv/std_step_conf": 0.9323161244392395,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6923175580351416,
"calib/avg_num_step_conf": 5.9765625,
"calib/ece": 0.23633466135458167,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.2926074131076054,
"calib/mean_conf": 0.7392828685258964,
"calib/mu_c": 0.8710144927536232,
"calib/mu_w": 0.5784070796460178,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21290836653386452,
"calib/std_conf": 0.3629069822341826,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4210052219321148,
"calib/step_q_c_n": 766.0,
"calib/step_q_gap": 0.10131935805776926,
"calib/step_q_w": 0.31968586387434555,
"calib/step_q_w_n": 764.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 479.65625,
"completions/mean_terminated_length": 485.3439025878906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.09555070847272873,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0926,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.038702912628650665,
"mask/share_reasoning": 0.8145185112953186,
"mask/share_step_conf": 0.13505981862545013,
"num_tokens": 27542792.0,
"reward": 0.9307032823562622,
"reward_std": 0.158748060464859,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7154749631881714,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8420253992080688,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.6382849216461182,
"adv/mean_abs_reasoning": 0.6090080738067627,
"adv/mean_abs_step_conf": 0.7536479830741882,
"adv/ratio_final_to_reasoning": 1.048073004445332,
"adv/ratio_step_to_reasoning": 1.2375008074413798,
"adv/std_final_conf": 0.8556181788444519,
"adv/std_reasoning": 0.8265742063522339,
"adv/std_step_conf": 0.9337841868400574,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7651271839671122,
"calib/avg_num_step_conf": 5.42578125,
"calib/ece": 0.19653386454183264,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5936254980079682,
"calib/gap": 0.40546056012333015,
"calib/mean_conf": 0.6560557768924303,
"calib/mu_c": 0.8369784172661873,
"calib/mu_w": 0.4315178571428571,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14940239043824696,
"calib/std_conf": 0.4104879771187675,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.452749244712991,
"calib/step_q_c_n": 662.0,
"calib/step_q_gap": 0.1888565349468287,
"calib/step_q_w": 0.2638927097661623,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2752.0,
"completions/max_terminated_length": 2752.0,
"completions/mean_length": 496.30859375,
"completions/mean_terminated_length": 502.1936950683594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.16473527252674103,
"learning_rate": 2.25e-06,
"loss": -0.1114,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03695293143391609,
"mask/share_reasoning": 0.8348743319511414,
"mask/share_step_conf": 0.11645397543907166,
"num_tokens": 27774911.0,
"reward": 0.9588329792022705,
"reward_std": 0.2034362256526947,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7515574097633362,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8629834651947021,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.5428203344345093,
"adv/mean_abs_reasoning": 0.4027462303638458,
"adv/mean_abs_step_conf": 0.7498716115951538,
"adv/ratio_final_to_reasoning": 1.3477974305162803,
"adv/ratio_step_to_reasoning": 1.8618960403867981,
"adv/std_final_conf": 0.7921384572982788,
"adv/std_reasoning": 0.701229453086853,
"adv/std_step_conf": 0.9321331977844238,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8131541077969648,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.15433070866141735,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5511811023622047,
"calib/gap": 0.49633961276818434,
"calib/mean_conf": 0.6260629921259843,
"calib/mu_c": 0.8175641025641027,
"calib/mu_w": 0.3212244897959184,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08311023622047245,
"calib/std_conf": 0.4156233483402194,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4483730715287518,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.15654071285428783,
"calib/step_q_w": 0.29183235867446394,
"calib/step_q_w_n": 513.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2591.0,
"completions/max_terminated_length": 2591.0,
"completions/mean_length": 416.72265625,
"completions/mean_terminated_length": 416.72265625,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.128,
"grad_norm": 0.08957294374704361,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0547,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04134856536984444,
"mask/share_reasoning": 0.8332849740982056,
"mask/share_step_conf": 0.1253664195537567,
"num_tokens": 27988280.0,
"reward": 1.012019157409668,
"reward_std": 0.14625471830368042,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.8189327716827393,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8847928047180176,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.6672395467758179,
"adv/mean_abs_reasoning": 0.488075315952301,
"adv/mean_abs_step_conf": 0.7569321393966675,
"adv/ratio_final_to_reasoning": 1.3670831631260498,
"adv/ratio_step_to_reasoning": 1.5508510974782455,
"adv/std_final_conf": 0.8629491925239563,
"adv/std_reasoning": 0.7574973702430725,
"adv/std_step_conf": 0.9334088563919067,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6913127413127413,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.24367588932806328,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.541501976284585,
"calib/gap": 0.3026287001287002,
"calib/mean_conf": 0.6179841897233203,
"calib/mu_c": 0.7435810810810811,
"calib/mu_w": 0.44095238095238093,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13833992094861663,
"calib/std_conf": 0.4176135472525399,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4174746008708273,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.0926760019216154,
"calib/step_q_w": 0.32479859894921187,
"calib/step_q_w_n": 571.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2678.0,
"completions/max_terminated_length": 2678.0,
"completions/mean_length": 467.4765625,
"completions/mean_terminated_length": 469.3098449707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.0753481537103653,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0361,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0383126437664032,
"mask/share_reasoning": 0.8395795822143555,
"mask/share_step_conf": 0.11820157617330551,
"num_tokens": 28213010.0,
"reward": 0.9504961967468262,
"reward_std": 0.16614177823066711,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7162296772003174,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8722625374794006,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.556941032409668,
"adv/mean_abs_reasoning": 0.43468087911605835,
"adv/mean_abs_step_conf": 0.7523335218429565,
"adv/ratio_final_to_reasoning": 1.2812641622107481,
"adv/ratio_step_to_reasoning": 1.7307720628817584,
"adv/std_final_conf": 0.7994430065155029,
"adv/std_reasoning": 0.7013107538223267,
"adv/std_step_conf": 0.9319691061973572,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8082039911308205,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.1551984126984126,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.623015873015873,
"calib/gap": 0.5038747228381373,
"calib/mean_conf": 0.6683730158730159,
"calib/mu_c": 0.8443292682926827,
"calib/mu_w": 0.3404545454545454,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08638888888888879,
"calib/std_conf": 0.4209618282685004,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45235079171741777,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.22553943946366148,
"calib/step_q_w": 0.2268113522537563,
"calib/step_q_w_n": 599.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 421.1484375,
"completions/mean_terminated_length": 426.1423034667969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.07491142302751541,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0503,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04027494415640831,
"mask/share_reasoning": 0.8144368529319763,
"mask/share_step_conf": 0.13356944918632507,
"num_tokens": 28428168.0,
"reward": 1.0099565982818604,
"reward_std": 0.1610824018716812,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.811364471912384,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8835486769676208,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.6732733845710754,
"adv/mean_abs_reasoning": 0.47695034742355347,
"adv/mean_abs_step_conf": 0.7559632062911987,
"adv/ratio_final_to_reasoning": 1.4116215413366253,
"adv/ratio_step_to_reasoning": 1.5849935121650498,
"adv/std_final_conf": 0.8682565093040466,
"adv/std_reasoning": 0.7574825882911682,
"adv/std_step_conf": 0.933620810508728,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6533776301218162,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.2892941176470589,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4745098039215686,
"calib/gap": 0.2476153562200073,
"calib/mean_conf": 0.5544705882352942,
"calib/mu_c": 0.6768217054263566,
"calib/mu_w": 0.42920634920634926,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1689411764705883,
"calib/std_conf": 0.4278780453399867,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3547577092511013,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.0827335805647742,
"calib/step_q_w": 0.2720241286863271,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1793.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 494.90234375,
"completions/mean_terminated_length": 496.8431701660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.1312,
"grad_norm": 0.08160626143217087,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0256,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03725016117095947,
"mask/share_reasoning": 0.8364986777305603,
"mask/share_step_conf": 0.12234492599964142,
"num_tokens": 28660151.0,
"reward": 0.9081982374191284,
"reward_std": 0.18566684424877167,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6762874722480774,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8424526453018188,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.5857831835746765,
"adv/mean_abs_reasoning": 0.25135308504104614,
"adv/mean_abs_step_conf": 0.7474823594093323,
"adv/ratio_final_to_reasoning": 2.330519171781869,
"adv/ratio_step_to_reasoning": 2.9738340362413607,
"adv/std_final_conf": 0.8261035084724426,
"adv/std_reasoning": 0.5725484490394592,
"adv/std_step_conf": 0.9320715069770813,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6433784501061571,
"calib/avg_num_step_conf": 5.5390625,
"calib/ece": 0.3070750988142292,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5573122529644269,
"calib/gap": 0.20498473991507438,
"calib/mean_conf": 0.6358498023715415,
"calib/mu_c": 0.7136305732484077,
"calib/mu_w": 0.5086458333333334,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16118577075098808,
"calib/std_conf": 0.4154265501233542,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40996088657105606,
"calib/step_q_c_n": 767.0,
"calib/step_q_gap": 0.1465507483222081,
"calib/step_q_w": 0.26341013824884796,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2463.0,
"completions/max_terminated_length": 2463.0,
"completions/mean_length": 440.51953125,
"completions/mean_terminated_length": 443.9881896972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.11682891100645065,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0544,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03923002630472183,
"mask/share_reasoning": 0.8280006051063538,
"mask/share_step_conf": 0.12495690584182739,
"num_tokens": 28879740.0,
"reward": 0.9176300168037415,
"reward_std": 0.1578715741634369,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6801886558532715,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8347588181495667,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.6839307546615601,
"adv/mean_abs_reasoning": 0.4573225677013397,
"adv/mean_abs_step_conf": 0.7513834238052368,
"adv/ratio_final_to_reasoning": 1.4955106154048572,
"adv/ratio_step_to_reasoning": 1.6430053464930627,
"adv/std_final_conf": 0.856546938419342,
"adv/std_reasoning": 0.7206392884254456,
"adv/std_step_conf": 0.9326479434967041,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6251910425941923,
"calib/avg_num_step_conf": 4.9609375,
"calib/ece": 0.32336,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.492,
"calib/gap": 0.17775799056415692,
"calib/mean_conf": 0.5783200000000001,
"calib/mu_c": 0.6501342281879194,
"calib/mu_w": 0.4723762376237625,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15284000000000003,
"calib/std_conf": 0.42156610110396686,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.41558869701726836,
"calib/step_q_c_n": 637.0,
"calib/step_q_gap": 0.12487779654333475,
"calib/step_q_w": 0.2907109004739336,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 474.046875,
"completions/mean_terminated_length": 479.6679992675781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.10979079455137253,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.097,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.04010496288537979,
"mask/share_reasoning": 0.8361002206802368,
"mask/share_step_conf": 0.1120760440826416,
"num_tokens": 29105904.0,
"reward": 0.8949642181396484,
"reward_std": 0.1865251064300537,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6494511365890503,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8295398354530334,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5566482543945312,
"adv/mean_abs_reasoning": 0.3620987832546234,
"adv/mean_abs_step_conf": 0.7504023909568787,
"adv/ratio_final_to_reasoning": 1.5372828635082794,
"adv/ratio_step_to_reasoning": 2.072369269545998,
"adv/std_final_conf": 0.8037400245666504,
"adv/std_reasoning": 0.6402396559715271,
"adv/std_step_conf": 0.9239445328712463,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7824463118580766,
"calib/avg_num_step_conf": 5.17578125,
"calib/ece": 0.19692913385826766,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.49606299212598426,
"calib/gap": 0.4998842203548089,
"calib/mean_conf": 0.5429133858267716,
"calib/mu_c": 0.7771111111111113,
"calib/mu_w": 0.2772268907563024,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10417322834645666,
"calib/std_conf": 0.451641119872186,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4148795180722892,
"calib/step_q_c_n": 664.0,
"calib/step_q_gap": 0.1740323168620017,
"calib/step_q_w": 0.24084720121028746,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2763.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 461.94921875,
"completions/mean_terminated_length": 461.94921875,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.1344,
"grad_norm": 0.0892636626958847,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0718,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.042710691690444946,
"mask/share_reasoning": 0.8292683362960815,
"mask/share_step_conf": 0.1280210316181183,
"num_tokens": 29329627.0,
"reward": 0.9750540256500244,
"reward_std": 0.14126716554164886,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7896147966384888,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8565868735313416,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.6591480374336243,
"adv/mean_abs_reasoning": 0.4783036410808563,
"adv/mean_abs_step_conf": 0.7303962707519531,
"adv/ratio_final_to_reasoning": 1.3780953787934818,
"adv/ratio_step_to_reasoning": 1.527055635832973,
"adv/std_final_conf": 0.8759030699729919,
"adv/std_reasoning": 0.7753259539604187,
"adv/std_step_conf": 0.9340888857841492,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7108144192256343,
"calib/avg_num_step_conf": 4.765625,
"calib/ece": 0.2846558704453441,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.41295546558704455,
"calib/gap": 0.2901962616822431,
"calib/mean_conf": 0.4972874493927126,
"calib/mu_c": 0.6230000000000001,
"calib/mu_w": 0.332803738317757,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10757085020242904,
"calib/std_conf": 0.4288431033039813,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.42916943521594675,
"calib/step_q_c_n": 602.0,
"calib/step_q_gap": 0.18598173295057455,
"calib/step_q_w": 0.2431877022653722,
"calib/step_q_w_n": 618.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2593.0,
"completions/max_terminated_length": 2593.0,
"completions/mean_length": 410.05859375,
"completions/mean_terminated_length": 414.92095947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.07838647067546844,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0798,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0423613004386425,
"mask/share_reasoning": 0.8236986398696899,
"mask/share_step_conf": 0.12222124636173248,
"num_tokens": 29538274.0,
"reward": 0.902811586856842,
"reward_std": 0.2152184695005417,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6757652163505554,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8290766477584839,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.66679847240448,
"adv/mean_abs_reasoning": 0.5108741521835327,
"adv/mean_abs_step_conf": 0.7615749835968018,
"adv/ratio_final_to_reasoning": 1.305210822576381,
"adv/ratio_step_to_reasoning": 1.4907291362104462,
"adv/std_final_conf": 0.8649638295173645,
"adv/std_reasoning": 0.7576236128807068,
"adv/std_step_conf": 0.9336161017417908,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7494089834515366,
"calib/avg_num_step_conf": 4.09375,
"calib/ece": 0.2813888888888887,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.34249377036611084,
"calib/mean_conf": 0.4142460317460317,
"calib/mu_c": 0.5651063829787234,
"calib/mu_w": 0.22261261261261256,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.06805555555555545,
"calib/std_conf": 0.42650739561700335,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4248576850094876,
"calib/step_q_c_n": 527.0,
"calib/step_q_gap": 0.15871565046054337,
"calib/step_q_w": 0.26614203454894425,
"calib/step_q_w_n": 521.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2748.0,
"completions/max_terminated_length": 2748.0,
"completions/mean_length": 442.75390625,
"completions/mean_terminated_length": 444.490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.08396448940038681,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0167,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04571385681629181,
"mask/share_reasoning": 0.8379007577896118,
"mask/share_step_conf": 0.11247918009757996,
"num_tokens": 29758283.0,
"reward": 0.911347508430481,
"reward_std": 0.18492817878723145,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6947382688522339,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8240504264831543,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.6443102955818176,
"adv/mean_abs_reasoning": 0.35985320806503296,
"adv/mean_abs_step_conf": 0.7655781507492065,
"adv/ratio_final_to_reasoning": 1.790480899270953,
"adv/ratio_step_to_reasoning": 2.1274734630428824,
"adv/std_final_conf": 0.8533403873443604,
"adv/std_reasoning": 0.6402210593223572,
"adv/std_step_conf": 0.9322566390037537,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6648866033755274,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.32421259842519684,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.32677165354330706,
"calib/gap": 0.24921809071729956,
"calib/mean_conf": 0.43492125984251967,
"calib/mu_c": 0.5291139240506328,
"calib/mu_w": 0.2798958333333333,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06854330708661417,
"calib/std_conf": 0.41206197657651583,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3711275167785235,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.08869281528598622,
"calib/step_q_w": 0.2824347014925373,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 396.50390625,
"completions/mean_terminated_length": 396.50390625,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1376,
"grad_norm": 0.10386553406715393,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0465,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04456010460853577,
"mask/share_reasoning": 0.8199877738952637,
"mask/share_step_conf": 0.13545210659503937,
"num_tokens": 29962172.0,
"reward": 0.9159624576568604,
"reward_std": 0.14984360337257385,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6719777584075928,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8372910022735596,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.6527014970779419,
"adv/mean_abs_reasoning": 0.335531085729599,
"adv/mean_abs_step_conf": 0.7392266988754272,
"adv/ratio_final_to_reasoning": 1.9452787680124137,
"adv/ratio_step_to_reasoning": 2.203154134789056,
"adv/std_final_conf": 0.8733097910881042,
"adv/std_reasoning": 0.6401881575584412,
"adv/std_step_conf": 0.932996392250061,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7235110568443903,
"calib/avg_num_step_conf": 4.13671875,
"calib/ece": 0.2717391304347826,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.383399209486166,
"calib/gap": 0.34689458689458685,
"calib/mean_conf": 0.46596837944664027,
"calib/mu_c": 0.5907407407407407,
"calib/mu_w": 0.24384615384615382,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.04869565217391305,
"calib/std_conf": 0.4331048296504483,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.41537993920972643,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.1030856748705743,
"calib/step_q_w": 0.31229426433915214,
"calib/step_q_w_n": 401.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1489.0,
"completions/max_terminated_length": 1489.0,
"completions/mean_length": 366.328125,
"completions/mean_terminated_length": 370.6719665527344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.0782543420791626,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0474,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04584982246160507,
"mask/share_reasoning": 0.8227038383483887,
"mask/share_step_conf": 0.11972758173942566,
"num_tokens": 30161240.0,
"reward": 0.9276007413864136,
"reward_std": 0.15801388025283813,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.695837140083313,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8367080688476562,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.636013925075531,
"adv/mean_abs_reasoning": 0.3565048277378082,
"adv/mean_abs_step_conf": 0.7499645352363586,
"adv/ratio_final_to_reasoning": 1.784026121360937,
"adv/ratio_step_to_reasoning": 2.1036588480308622,
"adv/std_final_conf": 0.851593554019928,
"adv/std_reasoning": 0.6611610054969788,
"adv/std_step_conf": 0.9324222207069397,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6987875726193484,
"calib/avg_num_step_conf": 4.0859375,
"calib/ece": 0.22199999999999995,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2901960784313726,
"calib/gap": 0.34479982318767366,
"calib/mean_conf": 0.3574509803921569,
"calib/mu_c": 0.557570093457944,
"calib/mu_w": 0.21277027027027026,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07992156862745095,
"calib/std_conf": 0.41659894259302566,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.41762124711316395,
"calib/step_q_c_n": 433.0,
"calib/step_q_gap": 0.1284695342257251,
"calib/step_q_w": 0.28915171288743885,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1582.0,
"completions/max_terminated_length": 1582.0,
"completions/mean_length": 392.80078125,
"completions/mean_terminated_length": 392.80078125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.07908864319324493,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0475,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0426817312836647,
"mask/share_reasoning": 0.8421868085861206,
"mask/share_step_conf": 0.11513150483369827,
"num_tokens": 30368005.0,
"reward": 0.9415676593780518,
"reward_std": 0.1567797064781189,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7366679906845093,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8659986257553101,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6498449444770813,
"adv/mean_abs_reasoning": 0.568773090839386,
"adv/mean_abs_step_conf": 0.7566512823104858,
"adv/ratio_final_to_reasoning": 1.142538131538626,
"adv/ratio_step_to_reasoning": 1.3303218708779494,
"adv/std_final_conf": 0.8433780670166016,
"adv/std_reasoning": 0.7928308248519897,
"adv/std_step_conf": 0.9342484474182129,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6983031530961125,
"calib/avg_num_step_conf": 4.203125,
"calib/ece": 0.24211764705882352,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.48627450980392156,
"calib/gap": 0.3788406989996203,
"calib/mean_conf": 0.5425882352941176,
"calib/mu_c": 0.7000671140939598,
"calib/mu_w": 0.3212264150943395,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10019607843137252,
"calib/std_conf": 0.44859821651628495,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4333222036727879,
"calib/step_q_c_n": 599.0,
"calib/step_q_gap": 0.1482068996895594,
"calib/step_q_w": 0.2851153039832285,
"calib/step_q_w_n": 477.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1399.0,
"completions/max_terminated_length": 1399.0,
"completions/mean_length": 404.47265625,
"completions/mean_terminated_length": 406.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.1408,
"grad_norm": 0.08786921203136444,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0412,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04840783774852753,
"mask/share_reasoning": 0.8275307416915894,
"mask/share_step_conf": 0.1201552003622055,
"num_tokens": 30577142.0,
"reward": 0.9508693218231201,
"reward_std": 0.18324634432792664,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7313722372055054,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.855522632598877,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7346389293670654,
"adv/mean_abs_reasoning": 0.5648310780525208,
"adv/mean_abs_step_conf": 0.7609961032867432,
"adv/ratio_final_to_reasoning": 1.3006347524290354,
"adv/ratio_step_to_reasoning": 1.3472985691767867,
"adv/std_final_conf": 0.9023372530937195,
"adv/std_reasoning": 0.8098082542419434,
"adv/std_step_conf": 0.9346030354499817,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6944772399317853,
"calib/avg_num_step_conf": 4.9296875,
"calib/ece": 0.23782608695652174,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2964426877470356,
"calib/gap": 0.3451010101010101,
"calib/mean_conf": 0.3673122529644269,
"calib/mu_c": 0.5773737373737373,
"calib/mu_w": 0.23227272727272724,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1069169960474308,
"calib/std_conf": 0.41836767465201996,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3430426356589147,
"calib/step_q_c_n": 516.0,
"calib/step_q_gap": 0.06647427104765463,
"calib/step_q_w": 0.27656836461126005,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 493.12109375,
"completions/mean_terminated_length": 493.12109375,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.07785658538341522,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.0052,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03774401545524597,
"mask/share_reasoning": 0.8516237735748291,
"mask/share_step_conf": 0.11063216626644135,
"num_tokens": 30809725.0,
"reward": 0.9364477396011353,
"reward_std": 0.20268605649471283,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.7341363430023193,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.865321695804596,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7228103876113892,
"adv/mean_abs_reasoning": 0.5332584381103516,
"adv/mean_abs_step_conf": 0.7673026323318481,
"adv/ratio_final_to_reasoning": 1.355459822019379,
"adv/ratio_step_to_reasoning": 1.4388944974801579,
"adv/std_final_conf": 0.885215163230896,
"adv/std_reasoning": 0.7577138543128967,
"adv/std_step_conf": 0.9339636564254761,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6724409945727629,
"calib/avg_num_step_conf": 4.65625,
"calib/ece": 0.3328853754940711,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.383399209486166,
"calib/gap": 0.21952984980436696,
"calib/mean_conf": 0.4512252964426877,
"calib/mu_c": 0.5501438848920863,
"calib/mu_w": 0.33061403508771936,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1173517786561265,
"calib/std_conf": 0.4389149402054845,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.39446868217054254,
"calib/step_q_c_n": 645.0,
"calib/step_q_gap": 0.11713467851423537,
"calib/step_q_w": 0.27733400365630717,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2900.0,
"completions/max_terminated_length": 2900.0,
"completions/mean_length": 475.8359375,
"completions/mean_terminated_length": 477.7019958496094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.07474420964717865,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0342,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.039326705038547516,
"mask/share_reasoning": 0.8487839698791504,
"mask/share_step_conf": 0.10798301547765732,
"num_tokens": 31040491.0,
"reward": 0.8866908550262451,
"reward_std": 0.21303297579288483,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6380894780158997,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8321672677993774,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.6431422233581543,
"adv/mean_abs_reasoning": 0.34882500767707825,
"adv/mean_abs_step_conf": 0.7356947064399719,
"adv/ratio_final_to_reasoning": 1.8437388639106336,
"adv/ratio_step_to_reasoning": 2.1090652626632633,
"adv/std_final_conf": 0.8505651354789734,
"adv/std_reasoning": 0.6403726935386658,
"adv/std_step_conf": 0.9314845204353333,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6983677767539818,
"calib/avg_num_step_conf": 4.69921875,
"calib/ece": 0.29024096385542164,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.41365461847389556,
"calib/gap": 0.28246215611425574,
"calib/mean_conf": 0.5005220883534137,
"calib/mu_c": 0.6219014084507043,
"calib/mu_w": 0.33943925233644856,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11024096385542165,
"calib/std_conf": 0.43708431602072095,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.40368770764119605,
"calib/step_q_c_n": 602.0,
"calib/step_q_gap": 0.12139153459627094,
"calib/step_q_w": 0.2822961730449251,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 456.7890625,
"completions/mean_terminated_length": 462.20556640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.144,
"grad_norm": 0.09601619839668274,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0984,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04134872555732727,
"mask/share_reasoning": 0.8315227031707764,
"mask/share_step_conf": 0.11540976911783218,
"num_tokens": 31263309.0,
"reward": 0.9178197383880615,
"reward_std": 0.17275235056877136,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6783995628356934,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8517711162567139,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6235309839248657,
"adv/mean_abs_reasoning": 0.3897348642349243,
"adv/mean_abs_step_conf": 0.7583878040313721,
"adv/ratio_final_to_reasoning": 1.5998850530062247,
"adv/ratio_step_to_reasoning": 1.9459070091667017,
"adv/std_final_conf": 0.8260871171951294,
"adv/std_reasoning": 0.6815566420555115,
"adv/std_step_conf": 0.9311226606369019,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8587307501241926,
"calib/avg_num_step_conf": 5.10546875,
"calib/ece": 0.15692913385826765,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3031496062992126,
"calib/gap": 0.5257886239443618,
"calib/mean_conf": 0.3671653543307087,
"calib/mu_c": 0.6404098360655739,
"calib/mu_w": 0.11462121212121214,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.02188976377952751,
"calib/std_conf": 0.422739281611019,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4003921568627451,
"calib/step_q_c_n": 561.0,
"calib/step_q_gap": 0.20574068233191398,
"calib/step_q_w": 0.19465147453083112,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1721.0,
"completions/max_terminated_length": 1721.0,
"completions/mean_length": 437.71875,
"completions/mean_terminated_length": 439.4353332519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.08469892293214798,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0622,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04148070514202118,
"mask/share_reasoning": 0.829554557800293,
"mask/share_step_conf": 0.12505844235420227,
"num_tokens": 31483853.0,
"reward": 0.9983916282653809,
"reward_std": 0.14055410027503967,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.8112839460372925,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8925305008888245,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6505257487297058,
"adv/mean_abs_reasoning": 0.39071184396743774,
"adv/mean_abs_step_conf": 0.7679653167724609,
"adv/ratio_final_to_reasoning": 1.664975758410644,
"adv/ratio_step_to_reasoning": 1.965554227827462,
"adv/std_final_conf": 0.8569141626358032,
"adv/std_reasoning": 0.6815319657325745,
"adv/std_step_conf": 0.9333835244178772,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.740466366591648,
"calib/avg_num_step_conf": 4.52734375,
"calib/ece": 0.24162055335968374,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3438735177865613,
"calib/gap": 0.3437815703925983,
"calib/mean_conf": 0.4397233201581028,
"calib/mu_c": 0.608217054263566,
"calib/mu_w": 0.26443548387096766,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08573122529644261,
"calib/std_conf": 0.4285663232321683,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.429008547008547,
"calib/step_q_c_n": 585.0,
"calib/step_q_gap": 0.1425102891688258,
"calib/step_q_w": 0.2864982578397212,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2550.0,
"completions/max_terminated_length": 2550.0,
"completions/mean_length": 408.57421875,
"completions/mean_terminated_length": 410.1764831542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.0840989202260971,
"learning_rate": 1.75e-06,
"loss": -0.0431,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04242321103811264,
"mask/share_reasoning": 0.8325255513191223,
"mask/share_step_conf": 0.12114499509334564,
"num_tokens": 31695432.0,
"reward": 0.9399605989456177,
"reward_std": 0.16075079143047333,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7247363328933716,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8567474484443665,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.6266717910766602,
"adv/mean_abs_reasoning": 0.5125274658203125,
"adv/mean_abs_step_conf": 0.7337398529052734,
"adv/ratio_final_to_reasoning": 1.2227086992765488,
"adv/ratio_step_to_reasoning": 1.4316107952008097,
"adv/std_final_conf": 0.8565248847007751,
"adv/std_reasoning": 0.7575325965881348,
"adv/std_step_conf": 0.9332212209701538,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7519230769230768,
"calib/avg_num_step_conf": 4.42578125,
"calib/ece": 0.2463281249999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.42578125,
"calib/gap": 0.40117948717948726,
"calib/mean_conf": 0.49546875,
"calib/mu_c": 0.6521794871794873,
"calib/mu_w": 0.251,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0662109374999999,
"calib/std_conf": 0.4419945195061105,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4335382308845577,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.1293107630733989,
"calib/step_q_w": 0.3042274678111588,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1222.0,
"completions/max_terminated_length": 1222.0,
"completions/mean_length": 412.43359375,
"completions/mean_terminated_length": 414.0509948730469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.1472,
"grad_norm": 0.11178892850875854,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0155,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04397442191839218,
"mask/share_reasoning": 0.8371366262435913,
"mask/share_step_conf": 0.11498266458511353,
"num_tokens": 31905351.0,
"reward": 0.9681872129440308,
"reward_std": 0.16365806758403778,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7441655993461609,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8711150884628296,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6846431493759155,
"adv/mean_abs_reasoning": 0.3737006187438965,
"adv/mean_abs_step_conf": 0.7485607266426086,
"adv/ratio_final_to_reasoning": 1.8320631945357129,
"adv/ratio_step_to_reasoning": 2.00310272206322,
"adv/std_final_conf": 0.8709847331047058,
"adv/std_reasoning": 0.6612383723258972,
"adv/std_step_conf": 0.9338328838348389,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7626567732615405,
"calib/avg_num_step_conf": 4.05859375,
"calib/ece": 0.2467716535433071,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.38188976377952755,
"calib/gap": 0.3783308162059231,
"calib/mean_conf": 0.4805511811023622,
"calib/mu_c": 0.6250318471337581,
"calib/mu_w": 0.24670103092783505,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05460629921259846,
"calib/std_conf": 0.4246449319686231,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4823040752351097,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.1532018308460823,
"calib/step_q_w": 0.3291022443890274,
"calib/step_q_w_n": 401.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1307.0,
"completions/max_terminated_length": 1307.0,
"completions/mean_length": 369.4140625,
"completions/mean_terminated_length": 370.8627624511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.07893967628479004,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0048,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0466064028441906,
"mask/share_reasoning": 0.8326088190078735,
"mask/share_step_conf": 0.11687853187322617,
"num_tokens": 32103017.0,
"reward": 0.9592806100845337,
"reward_std": 0.18043480813503265,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7375070452690125,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8599603176116943,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.612659215927124,
"adv/mean_abs_reasoning": 0.4621930420398712,
"adv/mean_abs_step_conf": 0.746800422668457,
"adv/ratio_final_to_reasoning": 1.3255483319765615,
"adv/ratio_step_to_reasoning": 1.615775995615343,
"adv/std_final_conf": 0.82439124584198,
"adv/std_reasoning": 0.7393609285354614,
"adv/std_step_conf": 0.9319517612457275,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7510753825541217,
"calib/avg_num_step_conf": 3.69921875,
"calib/ece": 0.23488000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.58,
"calib/gap": 0.3750906142021013,
"calib/mean_conf": 0.6427200000000001,
"calib/mu_c": 0.7732515337423312,
"calib/mu_w": 0.3981609195402299,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1128,
"calib/std_conf": 0.42086838987978176,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5261441441441441,
"calib/step_q_c_n": 555.0,
"calib/step_q_gap": 0.18042985842985848,
"calib/step_q_w": 0.34571428571428564,
"calib/step_q_w_n": 392.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 395.125,
"completions/mean_terminated_length": 399.810302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.07556366175413132,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.1017,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.046415865421295166,
"mask/share_reasoning": 0.832078218460083,
"mask/share_step_conf": 0.10978717356920242,
"num_tokens": 32309185.0,
"reward": 0.9508023858070374,
"reward_std": 0.21228066086769104,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7359312772750854,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8461422920227051,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.5549712181091309,
"adv/mean_abs_reasoning": 0.3791113495826721,
"adv/mean_abs_step_conf": 0.7596755027770996,
"adv/ratio_final_to_reasoning": 1.4638739217911736,
"adv/ratio_step_to_reasoning": 2.0038321290390138,
"adv/std_final_conf": 0.784277617931366,
"adv/std_reasoning": 0.6611729860305786,
"adv/std_step_conf": 0.932152509689331,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8491539135710915,
"calib/avg_num_step_conf": 4.5,
"calib/ece": 0.14665354330708666,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5669291338582677,
"calib/gap": 0.5754513584574936,
"calib/mean_conf": 0.625,
"calib/mu_c": 0.8311656441717793,
"calib/mu_w": 0.2557142857142857,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.06496062992125987,
"calib/std_conf": 0.43826676340502,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5248363095238094,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.26125297619047616,
"calib/step_q_w": 0.2635833333333333,
"calib/step_q_w_n": 480.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1724.0,
"completions/max_terminated_length": 1724.0,
"completions/mean_length": 424.7421875,
"completions/mean_terminated_length": 426.4078674316406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.1504,
"grad_norm": 0.07134176045656204,
"learning_rate": 1.638888888888889e-06,
"loss": 0.021,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04193870723247528,
"mask/share_reasoning": 0.8437968492507935,
"mask/share_step_conf": 0.11035820841789246,
"num_tokens": 32525015.0,
"reward": 1.0193904638290405,
"reward_std": 0.14930438995361328,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.8318593502044678,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8811401128768921,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.6422310471534729,
"adv/mean_abs_reasoning": 0.38081085681915283,
"adv/mean_abs_step_conf": 0.7638607025146484,
"adv/ratio_final_to_reasoning": 1.6864830286560568,
"adv/ratio_step_to_reasoning": 2.0058795300508097,
"adv/std_final_conf": 0.8471424579620361,
"adv/std_reasoning": 0.6814852356910706,
"adv/std_step_conf": 0.9324362874031067,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7695195195195195,
"calib/avg_num_step_conf": 4.60546875,
"calib/ece": 0.20956862745098045,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4745098039215686,
"calib/gap": 0.4343843843843845,
"calib/mean_conf": 0.5431372549019607,
"calib/mu_c": 0.7322222222222223,
"calib/mu_w": 0.29783783783783785,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09400000000000003,
"calib/std_conf": 0.442025153201775,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5328217821782179,
"calib/step_q_c_n": 606.0,
"calib/step_q_gap": 0.21055302127071346,
"calib/step_q_w": 0.3222687609075044,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1445.0,
"completions/max_terminated_length": 1445.0,
"completions/mean_length": 414.46484375,
"completions/mean_terminated_length": 416.0902099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.08121337741613388,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0221,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04288134723901749,
"mask/share_reasoning": 0.8329954147338867,
"mask/share_step_conf": 0.12021702527999878,
"num_tokens": 32736278.0,
"reward": 0.9745882153511047,
"reward_std": 0.1629941165447235,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7661937475204468,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8720451593399048,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.5666977763175964,
"adv/mean_abs_reasoning": 0.23824170231819153,
"adv/mean_abs_step_conf": 0.7360186576843262,
"adv/ratio_final_to_reasoning": 2.378667423895102,
"adv/ratio_step_to_reasoning": 3.0893779322534907,
"adv/std_final_conf": 0.7825659513473511,
"adv/std_reasoning": 0.5228584408760071,
"adv/std_step_conf": 0.9154124855995178,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8805031446540881,
"calib/avg_num_step_conf": 4.74609375,
"calib/ece": 0.12600000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.488,
"calib/gap": 0.5709067085953878,
"calib/mean_conf": 0.56488,
"calib/mu_c": 0.8069444444444445,
"calib/mu_w": 0.2360377358490566,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.05744000000000006,
"calib/std_conf": 0.43227235118614743,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5302007299270072,
"calib/step_q_c_n": 548.0,
"calib/step_q_gap": 0.3131092756541436,
"calib/step_q_w": 0.21709145427286358,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2968.0,
"completions/max_terminated_length": 2968.0,
"completions/mean_length": 430.609375,
"completions/mean_terminated_length": 434.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.07603774964809418,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0706,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.04396149888634682,
"mask/share_reasoning": 0.8335665464401245,
"mask/share_step_conf": 0.11465941369533539,
"num_tokens": 32953850.0,
"reward": 0.9956755638122559,
"reward_std": 0.1526007205247879,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.8216222524642944,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8634787201881409,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.5291182994842529,
"adv/mean_abs_reasoning": 0.2960050106048584,
"adv/mean_abs_step_conf": 0.7715482711791992,
"adv/ratio_final_to_reasoning": 1.7875315637497131,
"adv/ratio_step_to_reasoning": 2.6065378744860195,
"adv/std_final_conf": 0.7694202065467834,
"adv/std_reasoning": 0.5726398825645447,
"adv/std_step_conf": 0.9335837960243225,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7331118493909191,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.24940944881889748,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.562992125984252,
"calib/gap": 0.3708707087486158,
"calib/mean_conf": 0.6143700787401575,
"calib/mu_c": 0.7399404761904762,
"calib/mu_w": 0.36906976744186043,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10118110236220457,
"calib/std_conf": 0.44259635372771056,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47892247043363995,
"calib/step_q_c_n": 761.0,
"calib/step_q_gap": 0.14701770852887808,
"calib/step_q_w": 0.33190476190476187,
"calib/step_q_w_n": 378.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 414.06640625,
"completions/mean_terminated_length": 414.06640625,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.1536,
"grad_norm": 0.08561398833990097,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0063,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04731464385986328,
"mask/share_reasoning": 0.8295125365257263,
"mask/share_step_conf": 0.1231728121638298,
"num_tokens": 33163979.0,
"reward": 0.9652718901634216,
"reward_std": 0.1510903239250183,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7343593835830688,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8672782182693481,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.5362052917480469,
"adv/mean_abs_reasoning": 0.41883423924446106,
"adv/mean_abs_step_conf": 0.7684600353240967,
"adv/ratio_final_to_reasoning": 1.2802327066557704,
"adv/ratio_step_to_reasoning": 1.8347593470637187,
"adv/std_final_conf": 0.7784587740898132,
"adv/std_reasoning": 0.6816502213478088,
"adv/std_step_conf": 0.9331372976303101,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7162085976039464,
"calib/avg_num_step_conf": 4.46484375,
"calib/ece": 0.20179282868525894,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6055776892430279,
"calib/gap": 0.2933157152924596,
"calib/mean_conf": 0.7152589641434263,
"calib/mu_c": 0.8157575757575759,
"calib/mu_w": 0.5224418604651163,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12984063745019916,
"calib/std_conf": 0.3727803267091721,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5732499999999999,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.18438475177304964,
"calib/step_q_w": 0.3888652482269503,
"calib/step_q_w_n": 423.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2433.0,
"completions/max_terminated_length": 2433.0,
"completions/mean_length": 369.0703125,
"completions/mean_terminated_length": 371.97637939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.08192351460456848,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0497,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04904334992170334,
"mask/share_reasoning": 0.8118296265602112,
"mask/share_step_conf": 0.13131451606750488,
"num_tokens": 33361165.0,
"reward": 0.9571807980537415,
"reward_std": 0.16555394232273102,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7496456503868103,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.839715838432312,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.664141058921814,
"adv/mean_abs_reasoning": 0.46557626128196716,
"adv/mean_abs_step_conf": 0.7499189376831055,
"adv/ratio_final_to_reasoning": 1.4264925301240605,
"adv/ratio_step_to_reasoning": 1.6107327629166466,
"adv/std_final_conf": 0.8418440222740173,
"adv/std_reasoning": 0.7206533551216125,
"adv/std_step_conf": 0.9325315356254578,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7193112805272,
"calib/avg_num_step_conf": 4.53515625,
"calib/ece": 0.2748605577689242,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.601593625498008,
"calib/gap": 0.3559006331567385,
"calib/mean_conf": 0.6782868525896414,
"calib/mu_c": 0.8796330275229357,
"calib/mu_w": 0.5237323943661972,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2594422310756971,
"calib/std_conf": 0.4038083809831653,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.512928870292887,
"calib/step_q_c_n": 478.0,
"calib/step_q_gap": 0.14349255989757226,
"calib/step_q_w": 0.36943631039531477,
"calib/step_q_w_n": 683.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 418.5859375,
"completions/mean_terminated_length": 421.88189697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.10385292023420334,
"learning_rate": 1.5e-06,
"loss": -0.0516,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04161831736564636,
"mask/share_reasoning": 0.8323127627372742,
"mask/share_step_conf": 0.11825643479824066,
"num_tokens": 33575539.0,
"reward": 0.9033230543136597,
"reward_std": 0.21980994939804077,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.6927086114883423,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8334687352180481,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.5654524564743042,
"adv/mean_abs_reasoning": 0.3860156834125519,
"adv/mean_abs_step_conf": 0.7488675713539124,
"adv/ratio_final_to_reasoning": 1.4648432195175354,
"adv/ratio_step_to_reasoning": 1.9399926053096779,
"adv/std_final_conf": 0.7916289567947388,
"adv/std_reasoning": 0.6611943244934082,
"adv/std_step_conf": 0.9337374567985535,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6490112994350283,
"calib/avg_num_step_conf": 4.4609375,
"calib/ece": 0.3687109375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.69921875,
"calib/gap": 0.2368521247850648,
"calib/mean_conf": 0.7291015625000001,
"calib/mu_c": 0.8567796610169489,
"calib/mu_w": 0.6199275362318841,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31843750000000004,
"calib/std_conf": 0.4083794830608641,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5961284046692606,
"calib/step_q_c_n": 514.0,
"calib/step_q_gap": 0.15821439193040715,
"calib/step_q_w": 0.4379140127388535,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1381.0,
"completions/max_terminated_length": 1381.0,
"completions/mean_length": 401.33203125,
"completions/mean_terminated_length": 402.9059143066406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1568,
"grad_norm": 0.06466303765773773,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0024,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.042257264256477356,
"mask/share_reasoning": 0.8345401287078857,
"mask/share_step_conf": 0.11929632723331451,
"num_tokens": 33781960.0,
"reward": 0.8792563676834106,
"reward_std": 0.1607673168182373,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6305433511734009,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8357818722724915,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.61143559217453,
"adv/mean_abs_reasoning": 0.5280998945236206,
"adv/mean_abs_step_conf": 0.7383086085319519,
"adv/ratio_final_to_reasoning": 1.1578029053122298,
"adv/ratio_step_to_reasoning": 1.3980472561880604,
"adv/std_final_conf": 0.8347281217575073,
"adv/std_reasoning": 0.7753977179527283,
"adv/std_step_conf": 0.9333657622337341,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.757313829787234,
"calib/avg_num_step_conf": 4.5078125,
"calib/ece": 0.2012992125984252,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7283464566929134,
"calib/gap": 0.40677792553191483,
"calib/mean_conf": 0.7638976377952756,
"calib/mu_c": 0.9144375,
"calib/mu_w": 0.5076595744680852,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16763779527559056,
"calib/std_conf": 0.38358052742291554,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5885483870967742,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.19178991252050298,
"calib/step_q_w": 0.39675847457627117,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1849.0,
"completions/max_terminated_length": 1849.0,
"completions/mean_length": 384.1953125,
"completions/mean_terminated_length": 384.1953125,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.06249001622200012,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0046,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04991346597671509,
"mask/share_reasoning": 0.8199547529220581,
"mask/share_step_conf": 0.13013175129890442,
"num_tokens": 33985426.0,
"reward": 0.982866644859314,
"reward_std": 0.1941990852355957,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7852691411972046,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8570265769958496,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.5319117307662964,
"adv/mean_abs_reasoning": 0.39539942145347595,
"adv/mean_abs_step_conf": 0.722755491733551,
"adv/ratio_final_to_reasoning": 1.3452516668107541,
"adv/ratio_step_to_reasoning": 1.8279123653664548,
"adv/std_final_conf": 0.7791368365287781,
"adv/std_reasoning": 0.6814852356910706,
"adv/std_step_conf": 0.9184855818748474,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7100657894736842,
"calib/avg_num_step_conf": 4.72265625,
"calib/ece": 0.24849206349206357,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7857142857142857,
"calib/gap": 0.3319894736842104,
"calib/mean_conf": 0.8090476190476191,
"calib/mu_c": 0.9407894736842105,
"calib/mu_w": 0.6088000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22718253968253976,
"calib/std_conf": 0.3577862458681529,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5786814449917899,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.1967981116584565,
"calib/step_q_w": 0.38188333333333335,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 441.7734375,
"completions/mean_terminated_length": 441.7734375,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.0746147558093071,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0358,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04412522539496422,
"mask/share_reasoning": 0.8377799391746521,
"mask/share_step_conf": 0.11809486150741577,
"num_tokens": 34202976.0,
"reward": 0.9574228525161743,
"reward_std": 0.19794991612434387,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7374711036682129,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.861749529838562,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.4720293879508972,
"adv/mean_abs_reasoning": 0.4032324552536011,
"adv/mean_abs_step_conf": 0.757534384727478,
"adv/ratio_final_to_reasoning": 1.1706135798370405,
"adv/ratio_step_to_reasoning": 1.8786542969390925,
"adv/std_final_conf": 0.737794041633606,
"adv/std_reasoning": 0.6612992286682129,
"adv/std_step_conf": 0.9328404068946838,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6284599239144694,
"calib/avg_num_step_conf": 4.19921875,
"calib/ece": 0.2726086956521739,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7865612648221344,
"calib/gap": 0.2189610389610389,
"calib/mean_conf": 0.8250988142292492,
"calib/mu_c": 0.9107792207792207,
"calib/mu_w": 0.6918181818181818,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2445059288537549,
"calib/std_conf": 0.33064628611302,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5506818181818183,
"calib/step_q_c_n": 616.0,
"calib/step_q_gap": 0.13835066349772246,
"calib/step_q_w": 0.4123311546840958,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 379.26171875,
"completions/mean_terminated_length": 380.7490539550781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.16,
"grad_norm": 0.07002872973680496,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0237,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.05102251470088959,
"mask/share_reasoning": 0.8146055936813354,
"mask/share_step_conf": 0.13046567142009735,
"num_tokens": 34405027.0,
"reward": 0.9285831451416016,
"reward_std": 0.1642819046974182,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6999351382255554,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8400436639785767,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6051285862922668,
"adv/mean_abs_reasoning": 0.4240601360797882,
"adv/mean_abs_step_conf": 0.7665138244628906,
"adv/ratio_final_to_reasoning": 1.4269876718107972,
"adv/ratio_step_to_reasoning": 1.8075592569226284,
"adv/std_final_conf": 0.829787015914917,
"adv/std_reasoning": 0.7014648914337158,
"adv/std_step_conf": 0.9331311583518982,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7577142857142858,
"calib/avg_num_step_conf": 4.6328125,
"calib/ece": 0.26478087649402376,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6653386454183267,
"calib/gap": 0.39082222222222207,
"calib/mean_conf": 0.7074103585657371,
"calib/mu_c": 0.9035999999999998,
"calib/mu_w": 0.5127777777777778,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2370916334661353,
"calib/std_conf": 0.4102337140359674,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5399395161290322,
"calib/step_q_c_n": 496.0,
"calib/step_q_gap": 0.1792438639551192,
"calib/step_q_w": 0.36069565217391303,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 468.26953125,
"completions/mean_terminated_length": 471.9566955566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.06739109009504318,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04095233231782913,
"mask/share_reasoning": 0.8410570621490479,
"mask/share_step_conf": 0.11017806828022003,
"num_tokens": 34631928.0,
"reward": 0.9262844920158386,
"reward_std": 0.21015754342079163,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7189491987228394,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8398697376251221,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.6687546372413635,
"adv/mean_abs_reasoning": 0.5585987567901611,
"adv/mean_abs_step_conf": 0.778603732585907,
"adv/ratio_final_to_reasoning": 1.1972003680856431,
"adv/ratio_step_to_reasoning": 1.3938515313924897,
"adv/std_final_conf": 0.846340000629425,
"adv/std_reasoning": 0.7928636074066162,
"adv/std_step_conf": 0.9337232112884521,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6698924046603426,
"calib/avg_num_step_conf": 4.71484375,
"calib/ece": 0.32246031746031745,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7182539682539683,
"calib/gap": 0.21262749092761213,
"calib/mean_conf": 0.7651587301587303,
"calib/mu_c": 0.8605035971223023,
"calib/mu_w": 0.6478761061946902,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.26801587301587304,
"calib/std_conf": 0.3784067143582462,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.508804920913884,
"calib/step_q_c_n": 569.0,
"calib/step_q_gap": 0.10819363564742634,
"calib/step_q_w": 0.4006112852664576,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2108.0,
"completions/max_terminated_length": 2108.0,
"completions/mean_length": 406.9296875,
"completions/mean_terminated_length": 408.5255126953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.07073374837636948,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0261,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.044950634241104126,
"mask/share_reasoning": 0.8246068358421326,
"mask/share_step_conf": 0.1265362799167633,
"num_tokens": 34841494.0,
"reward": 0.8939030170440674,
"reward_std": 0.2405945062637329,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6507734060287476,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8331261873245239,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.5792268514633179,
"adv/mean_abs_reasoning": 0.39986711740493774,
"adv/mean_abs_step_conf": 0.7697999477386475,
"adv/ratio_final_to_reasoning": 1.4485483458164579,
"adv/ratio_step_to_reasoning": 1.9251394131493085,
"adv/std_final_conf": 0.7936047911643982,
"adv/std_reasoning": 0.661339282989502,
"adv/std_step_conf": 0.9338732361793518,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6221088877338877,
"calib/avg_num_step_conf": 4.56640625,
"calib/ece": 0.3182936507936508,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7857142857142857,
"calib/gap": 0.17196205821205812,
"calib/mean_conf": 0.8291666666666666,
"calib/mu_c": 0.9001351351351351,
"calib/mu_w": 0.728173076923077,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2800793650793651,
"calib/std_conf": 0.32914828158421433,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5255292259083728,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.15806653934120868,
"calib/step_q_w": 0.36746268656716413,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2259.0,
"completions/max_terminated_length": 2259.0,
"completions/mean_length": 385.21484375,
"completions/mean_terminated_length": 389.7826232910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.1632,
"grad_norm": 0.0813259556889534,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0518,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04396626353263855,
"mask/share_reasoning": 0.8241199254989624,
"mask/share_step_conf": 0.12019501626491547,
"num_tokens": 35047429.0,
"reward": 0.898368239402771,
"reward_std": 0.21193614602088928,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6636105179786682,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8206258416175842,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.582364022731781,
"adv/mean_abs_reasoning": 0.5326874256134033,
"adv/mean_abs_step_conf": 0.7495837211608887,
"adv/ratio_final_to_reasoning": 1.093256560470099,
"adv/ratio_step_to_reasoning": 1.4071736728114497,
"adv/std_final_conf": 0.7997604012489319,
"adv/std_reasoning": 0.7753660082817078,
"adv/std_step_conf": 0.9338908791542053,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6827019254813702,
"calib/avg_num_step_conf": 4.04296875,
"calib/ece": 0.33399209486166015,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7272727272727273,
"calib/gap": 0.2772130532633157,
"calib/mean_conf": 0.7707509881422925,
"calib/mu_c": 0.9120967741935483,
"calib/mu_w": 0.6348837209302326,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.307312252964427,
"calib/std_conf": 0.3743587788543868,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5527254098360657,
"calib/step_q_c_n": 488.0,
"calib/step_q_gap": 0.1330361959420986,
"calib/step_q_w": 0.4196892138939671,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 372.59375,
"completions/mean_terminated_length": 375.5275573730469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.06172942742705345,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0549,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.046777769923210144,
"mask/share_reasoning": 0.8256240487098694,
"mask/share_step_conf": 0.11978568881750107,
"num_tokens": 35247253.0,
"reward": 0.9011325836181641,
"reward_std": 0.21180510520935059,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6619023084640503,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8458315134048462,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.5934216380119324,
"adv/mean_abs_reasoning": 0.39241135120391846,
"adv/mean_abs_step_conf": 0.7745480537414551,
"adv/ratio_final_to_reasoning": 1.5122438130072287,
"adv/ratio_step_to_reasoning": 1.9738166374778425,
"adv/std_final_conf": 0.8153236508369446,
"adv/std_reasoning": 0.6814760565757751,
"adv/std_step_conf": 0.934339702129364,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6706211180124223,
"calib/avg_num_step_conf": 4.33203125,
"calib/ece": 0.3203529411764705,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6705882352941176,
"calib/gap": 0.2845248447204968,
"calib/mean_conf": 0.741529411764706,
"calib/mu_c": 0.8977391304347826,
"calib/mu_w": 0.6132142857142858,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30545098039215673,
"calib/std_conf": 0.38204867531692693,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5163991323210412,
"calib/step_q_c_n": 461.0,
"calib/step_q_gap": 0.15479419404943623,
"calib/step_q_w": 0.36160493827160495,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1271.0,
"completions/max_terminated_length": 1271.0,
"completions/mean_length": 365.33203125,
"completions/mean_terminated_length": 365.33203125,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.08029747009277344,
"learning_rate": 1.25e-06,
"loss": -0.0348,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04825538396835327,
"mask/share_reasoning": 0.8240121006965637,
"mask/share_step_conf": 0.12773250043392181,
"num_tokens": 35447994.0,
"reward": 0.8786300420761108,
"reward_std": 0.18942666053771973,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6564257740974426,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8125530481338501,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.59581458568573,
"adv/mean_abs_reasoning": 0.45483464002609253,
"adv/mean_abs_step_conf": 0.7564510107040405,
"adv/ratio_final_to_reasoning": 1.309958682240099,
"adv/ratio_step_to_reasoning": 1.6631341242185185,
"adv/std_final_conf": 0.8109592795372009,
"adv/std_reasoning": 0.701445996761322,
"adv/std_step_conf": 0.9301225543022156,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.666646123505485,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.2979215686274509,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.592156862745098,
"calib/gap": 0.26115432022679663,
"calib/mean_conf": 0.6849803921568627,
"calib/mu_c": 0.8099248120300753,
"calib/mu_w": 0.5487704918032786,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23066666666666655,
"calib/std_conf": 0.4056066620772085,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4784927066450568,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.07659573694808708,
"calib/step_q_w": 0.4018969696969697,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2247.0,
"completions/max_terminated_length": 2247.0,
"completions/mean_length": 415.80859375,
"completions/mean_terminated_length": 415.80859375,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.1664,
"grad_norm": 0.07279438525438309,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0239,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04457136243581772,
"mask/share_reasoning": 0.8282079100608826,
"mask/share_step_conf": 0.12722070515155792,
"num_tokens": 35659201.0,
"reward": 0.9135178923606873,
"reward_std": 0.18442893028259277,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6829797029495239,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.841712236404419,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6162358522415161,
"adv/mean_abs_reasoning": 0.5435482263565063,
"adv/mean_abs_step_conf": 0.7789421081542969,
"adv/ratio_final_to_reasoning": 1.133728015952231,
"adv/ratio_step_to_reasoning": 1.4330689907235548,
"adv/std_final_conf": 0.8435057401657104,
"adv/std_reasoning": 0.7753556370735168,
"adv/std_step_conf": 0.9337359666824341,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7154391974553462,
"calib/avg_num_step_conf": 4.6484375,
"calib/ece": 0.28910156249999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.69921875,
"calib/gap": 0.3077746513334968,
"calib/mean_conf": 0.7451171875000001,
"calib/mu_c": 0.8917910447761196,
"calib/mu_w": 0.5840163934426228,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.255390625,
"calib/std_conf": 0.38766325434207694,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5419318181818181,
"calib/step_q_c_n": 616.0,
"calib/step_q_gap": 0.12998059866962303,
"calib/step_q_w": 0.4119512195121951,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1550.0,
"completions/max_terminated_length": 1550.0,
"completions/mean_length": 400.73828125,
"completions/mean_terminated_length": 402.309814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.08922214806079865,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04538150504231453,
"mask/share_reasoning": 0.8202134966850281,
"mask/share_step_conf": 0.1304987668991089,
"num_tokens": 35865518.0,
"reward": 0.9309684038162231,
"reward_std": 0.19485074281692505,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7046737670898438,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8525755405426025,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.5458027124404907,
"adv/mean_abs_reasoning": 0.4660303294658661,
"adv/mean_abs_step_conf": 0.7698330283164978,
"adv/ratio_final_to_reasoning": 1.1711742303683423,
"adv/ratio_step_to_reasoning": 1.6518946936325598,
"adv/std_final_conf": 0.7610622644424438,
"adv/std_reasoning": 0.720642626285553,
"adv/std_step_conf": 0.9223465919494629,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.552269250382458,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.39011811023622045,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8228346456692913,
"calib/gap": 0.045982916879143354,
"calib/mean_conf": 0.8454724409448822,
"calib/mu_c": 0.8646621621621622,
"calib/mu_w": 0.8186792452830188,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3264566929133858,
"calib/std_conf": 0.3211365968048917,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5013793103448275,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.05773524254821738,
"calib/step_q_w": 0.44364406779661014,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2314.0,
"completions/max_terminated_length": 2314.0,
"completions/mean_length": 396.359375,
"completions/mean_terminated_length": 396.359375,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.06744744628667831,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0448,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04748845472931862,
"mask/share_reasoning": 0.8199511766433716,
"mask/share_step_conf": 0.13256040215492249,
"num_tokens": 36072226.0,
"reward": 0.8509544134140015,
"reward_std": 0.21927496790885925,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5915929675102234,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.800940752029419,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.5806899070739746,
"adv/mean_abs_reasoning": 0.3605530560016632,
"adv/mean_abs_step_conf": 0.7557182312011719,
"adv/ratio_final_to_reasoning": 1.6105532803230378,
"adv/ratio_step_to_reasoning": 2.095997298099966,
"adv/std_final_conf": 0.7915095686912537,
"adv/std_reasoning": 0.6402944922447205,
"adv/std_step_conf": 0.9314445853233337,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6462706146926538,
"calib/avg_num_step_conf": 4.24609375,
"calib/ece": 0.3015748031496062,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6850393700787402,
"calib/gap": 0.21737256371814107,
"calib/mean_conf": 0.7509448818897638,
"calib/mu_c": 0.8502173913043479,
"calib/mu_w": 0.6328448275862069,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25460629921259836,
"calib/std_conf": 0.3749896231023398,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5217921146953406,
"calib/step_q_c_n": 558.0,
"calib/step_q_gap": 0.15651801261594556,
"calib/step_q_w": 0.36527410207939504,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1620.0,
"completions/max_terminated_length": 1620.0,
"completions/mean_length": 373.90625,
"completions/mean_terminated_length": 375.37255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1696,
"grad_norm": 0.07806520909070969,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0167,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.047464147210121155,
"mask/share_reasoning": 0.8260021805763245,
"mask/share_step_conf": 0.12262741476297379,
"num_tokens": 36272730.0,
"reward": 0.9267535209655762,
"reward_std": 0.1796480417251587,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6707344055175781,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8765225410461426,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.5432534217834473,
"adv/mean_abs_reasoning": 0.5357876420021057,
"adv/mean_abs_step_conf": 0.7486876249313354,
"adv/ratio_final_to_reasoning": 1.0139342142223433,
"adv/ratio_step_to_reasoning": 1.3973588904247123,
"adv/std_final_conf": 0.7767263054847717,
"adv/std_reasoning": 0.7754621505737305,
"adv/std_step_conf": 0.9328858852386475,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7101940457203615,
"calib/avg_num_step_conf": 4.625,
"calib/ece": 0.23509960159362542,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6812749003984063,
"calib/gap": 0.33595627325890487,
"calib/mean_conf": 0.7333466135458169,
"calib/mu_c": 0.8658552631578948,
"calib/mu_w": 0.52989898989899,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18143426294820714,
"calib/std_conf": 0.3908199004657845,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5018335901386748,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.17026349668073087,
"calib/step_q_w": 0.3315700934579439,
"calib/step_q_w_n": 535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 453.4140625,
"completions/mean_terminated_length": 453.4140625,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.07252976298332214,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0478,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04366093873977661,
"mask/share_reasoning": 0.8330541849136353,
"mask/share_step_conf": 0.12328487634658813,
"num_tokens": 36493644.0,
"reward": 0.940780758857727,
"reward_std": 0.21383237838745117,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7339656352996826,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8335334062576294,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5750141143798828,
"adv/mean_abs_reasoning": 0.5041825771331787,
"adv/mean_abs_step_conf": 0.7313765287399292,
"adv/ratio_final_to_reasoning": 1.1404878717734708,
"adv/ratio_step_to_reasoning": 1.4506184106927948,
"adv/std_final_conf": 0.8224272131919861,
"adv/std_reasoning": 0.7753112316131592,
"adv/std_step_conf": 0.9324716925621033,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6812957157784743,
"calib/avg_num_step_conf": 4.03515625,
"calib/ece": 0.22353174603174608,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7698412698412699,
"calib/gap": 0.3074879832810864,
"calib/mean_conf": 0.8030555555555556,
"calib/mu_c": 0.9092121212121211,
"calib/mu_w": 0.6017241379310347,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18591269841269845,
"calib/std_conf": 0.35829815564994427,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.497725258493353,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.09244435961694847,
"calib/step_q_w": 0.4052808988764045,
"calib/step_q_w_n": 356.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1934.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 369.64453125,
"completions/mean_terminated_length": 371.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.07589954882860184,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0766,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0506252720952034,
"mask/share_reasoning": 0.8228753805160522,
"mask/share_step_conf": 0.12259312719106674,
"num_tokens": 36692193.0,
"reward": 0.9586951732635498,
"reward_std": 0.20655444264411926,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7506816387176514,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8409274816513062,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5890517234802246,
"adv/mean_abs_reasoning": 0.35327833890914917,
"adv/mean_abs_step_conf": 0.7464599609375,
"adv/ratio_final_to_reasoning": 1.6673870390669723,
"adv/ratio_step_to_reasoning": 2.112951400423855,
"adv/std_final_conf": 0.8073970079421997,
"adv/std_reasoning": 0.6402244567871094,
"adv/std_step_conf": 0.9316359162330627,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6739285714285714,
"calib/avg_num_step_conf": 4.57421875,
"calib/ece": 0.21890196078431368,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7254901960784313,
"calib/gap": 0.26711785714285696,
"calib/mean_conf": 0.7849411764705884,
"calib/mu_c": 0.868742857142857,
"calib/mu_w": 0.6016250000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15878431372549015,
"calib/std_conf": 0.3555110881459175,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4782210927573062,
"calib/step_q_c_n": 787.0,
"calib/step_q_gap": 0.08267421775730621,
"calib/step_q_w": 0.39554687499999996,
"calib/step_q_w_n": 384.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1466.0,
"completions/max_terminated_length": 1466.0,
"completions/mean_length": 369.203125,
"completions/mean_terminated_length": 370.6510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.1728,
"grad_norm": 0.07268118858337402,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0244,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04668009281158447,
"mask/share_reasoning": 0.8181484937667847,
"mask/share_step_conf": 0.13126519322395325,
"num_tokens": 36890853.0,
"reward": 0.980265200138092,
"reward_std": 0.18448534607887268,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7606140971183777,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8639787435531616,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5629448294639587,
"adv/mean_abs_reasoning": 0.37756574153900146,
"adv/mean_abs_step_conf": 0.7611972093582153,
"adv/ratio_final_to_reasoning": 1.4909849266761617,
"adv/ratio_step_to_reasoning": 2.016065351309385,
"adv/std_final_conf": 0.7932924032211304,
"adv/std_reasoning": 0.6403728723526001,
"adv/std_step_conf": 0.9302433133125305,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6921878972794304,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.25400793650793646,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5674603174603174,
"calib/gap": 0.3168916857360794,
"calib/mean_conf": 0.6593253968253967,
"calib/mu_c": 0.80268115942029,
"calib/mu_w": 0.48578947368421055,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1828571428571428,
"calib/std_conf": 0.4148770391440933,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4744969512195122,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.15307950573664614,
"calib/step_q_w": 0.32141744548286605,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2275.0,
"completions/max_terminated_length": 2275.0,
"completions/mean_length": 466.16015625,
"completions/mean_terminated_length": 466.16015625,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.08332052826881409,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0246,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04192296415567398,
"mask/share_reasoning": 0.8303929567337036,
"mask/share_step_conf": 0.12768401205539703,
"num_tokens": 37115022.0,
"reward": 0.9365798234939575,
"reward_std": 0.1679585576057434,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7094457149505615,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8598076105117798,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6199654936790466,
"adv/mean_abs_reasoning": 0.44486063718795776,
"adv/mean_abs_step_conf": 0.7613261938095093,
"adv/ratio_final_to_reasoning": 1.3936173305823536,
"adv/ratio_step_to_reasoning": 1.7113813409565428,
"adv/std_final_conf": 0.8239652514457703,
"adv/std_reasoning": 0.7205904126167297,
"adv/std_step_conf": 0.9310764074325562,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7449919484702092,
"calib/avg_num_step_conf": 4.984375,
"calib/ece": 0.2265199999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.568,
"calib/gap": 0.40511111111111126,
"calib/mean_conf": 0.63276,
"calib/mu_c": 0.8191111111111112,
"calib/mu_w": 0.414,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1596399999999999,
"calib/std_conf": 0.43284914508405814,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4530561056105611,
"calib/step_q_c_n": 606.0,
"calib/step_q_gap": 0.16538446381951633,
"calib/step_q_w": 0.28767164179104476,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2548.0,
"completions/max_terminated_length": 2548.0,
"completions/mean_length": 465.44140625,
"completions/mean_terminated_length": 467.2666931152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.07369590550661087,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0226,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.040351904928684235,
"mask/share_reasoning": 0.8398116827011108,
"mask/share_step_conf": 0.11593015491962433,
"num_tokens": 37340311.0,
"reward": 0.9514877796173096,
"reward_std": 0.1814444661140442,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7391566038131714,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8630375862121582,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5729429125785828,
"adv/mean_abs_reasoning": 0.3981139659881592,
"adv/mean_abs_step_conf": 0.7356992959976196,
"adv/ratio_final_to_reasoning": 1.439142962886219,
"adv/ratio_step_to_reasoning": 1.8479615357666226,
"adv/std_final_conf": 0.8126264214515686,
"adv/std_reasoning": 0.6815700531005859,
"adv/std_step_conf": 0.9325501322746277,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7445909614646012,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.28335968379446635,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6324110671936759,
"calib/gap": 0.4067065676609908,
"calib/mean_conf": 0.6860474308300396,
"calib/mu_c": 0.9207476635514018,
"calib/mu_w": 0.514041095890411,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2732411067193675,
"calib/std_conf": 0.41896563496773,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48675381263616546,
"calib/step_q_c_n": 459.0,
"calib/step_q_gap": 0.1439782482271349,
"calib/step_q_w": 0.34277556440903056,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2636.0,
"completions/max_terminated_length": 2636.0,
"completions/mean_length": 451.30078125,
"completions/mean_terminated_length": 453.07061767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.176,
"grad_norm": 0.07571450620889664,
"learning_rate": 9.722222222222224e-07,
"loss": -0.013,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04123295098543167,
"mask/share_reasoning": 0.8342355489730835,
"mask/share_step_conf": 0.12062521278858185,
"num_tokens": 37561420.0,
"reward": 0.9152418375015259,
"reward_std": 0.1929076462984085,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.697473406791687,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8525415658950806,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.5445481538772583,
"adv/mean_abs_reasoning": 0.42633116245269775,
"adv/mean_abs_step_conf": 0.7497313618659973,
"adv/ratio_final_to_reasoning": 1.2772891166211124,
"adv/ratio_step_to_reasoning": 1.758565706416503,
"adv/std_final_conf": 0.7711051106452942,
"adv/std_reasoning": 0.6816762089729309,
"adv/std_step_conf": 0.9312403798103333,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8083285385500576,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.14529411764705874,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5843137254901961,
"calib/gap": 0.5199769850402762,
"calib/mean_conf": 0.658,
"calib/mu_c": 0.8190909090909091,
"calib/mu_w": 0.2991139240506328,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.05654901960784307,
"calib/std_conf": 0.421354864180571,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4063033707865168,
"calib/step_q_c_n": 890.0,
"calib/step_q_gap": 0.168904650104214,
"calib/step_q_w": 0.2373987206823028,
"calib/step_q_w_n": 469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1501.0,
"completions/max_terminated_length": 1501.0,
"completions/mean_length": 444.359375,
"completions/mean_terminated_length": 446.10198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.07834941148757935,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0351,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04008059576153755,
"mask/share_reasoning": 0.8259658813476562,
"mask/share_step_conf": 0.1300472915172577,
"num_tokens": 37781360.0,
"reward": 1.0240976810455322,
"reward_std": 0.14218752086162567,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.8227410316467285,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8902982473373413,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.4789263904094696,
"adv/mean_abs_reasoning": 0.36526644229888916,
"adv/mean_abs_step_conf": 0.7769738435745239,
"adv/ratio_final_to_reasoning": 1.3111699705979973,
"adv/ratio_step_to_reasoning": 2.1271426925628827,
"adv/std_final_conf": 0.7239590287208557,
"adv/std_reasoning": 0.6403273940086365,
"adv/std_step_conf": 0.9335882663726807,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6627229080932785,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.2313888888888887,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.753968253968254,
"calib/gap": 0.2902098765432096,
"calib/mean_conf": 0.80234126984127,
"calib/mu_c": 0.9059876543209875,
"calib/mu_w": 0.6157777777777779,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19543650793650777,
"calib/std_conf": 0.3505682385557793,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.446627358490566,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.1079097114317425,
"calib/step_q_w": 0.3387176470588235,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2767.0,
"completions/max_terminated_length": 2767.0,
"completions/mean_length": 431.51171875,
"completions/mean_terminated_length": 433.2039489746094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.06649410724639893,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0577,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.042462125420570374,
"mask/share_reasoning": 0.82489013671875,
"mask/share_step_conf": 0.12874150276184082,
"num_tokens": 37997435.0,
"reward": 0.9571617841720581,
"reward_std": 0.1660226434469223,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7435324192047119,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8473536372184753,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5585249066352844,
"adv/mean_abs_reasoning": 0.4548640847206116,
"adv/mean_abs_step_conf": 0.7345359921455383,
"adv/ratio_final_to_reasoning": 1.2278940575806152,
"adv/ratio_step_to_reasoning": 1.6148471968207996,
"adv/std_final_conf": 0.7788551449775696,
"adv/std_reasoning": 0.7205998301506042,
"adv/std_step_conf": 0.9329665899276733,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6898065623555819,
"calib/avg_num_step_conf": 4.77734375,
"calib/ece": 0.2560714285714286,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6309523809523809,
"calib/gap": 0.28005347593582886,
"calib/mean_conf": 0.7059920634920636,
"calib/mu_c": 0.8160130718954248,
"calib/mu_w": 0.535959595959596,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17746031746031748,
"calib/std_conf": 0.3982075512911148,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4612481426448736,
"calib/step_q_c_n": 673.0,
"calib/step_q_gap": 0.1121208699176009,
"calib/step_q_w": 0.34912727272727273,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2865.0,
"completions/max_terminated_length": 2865.0,
"completions/mean_length": 481.0859375,
"completions/mean_terminated_length": 482.9725646972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.1792,
"grad_norm": 0.07671131938695908,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03883073478937149,
"mask/share_reasoning": 0.8422915935516357,
"mask/share_step_conf": 0.11497138440608978,
"num_tokens": 38225265.0,
"reward": 0.9440748691558838,
"reward_std": 0.18650661408901215,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7153031229972839,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8572216033935547,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.517835259437561,
"adv/mean_abs_reasoning": 0.35380783677101135,
"adv/mean_abs_step_conf": 0.75236976146698,
"adv/ratio_final_to_reasoning": 1.4636059623877415,
"adv/ratio_step_to_reasoning": 2.1264926416933005,
"adv/std_final_conf": 0.7613320350646973,
"adv/std_reasoning": 0.640367329120636,
"adv/std_step_conf": 0.93329256772995,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6736637512147716,
"calib/avg_num_step_conf": 4.41015625,
"calib/ece": 0.265595238095238,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.626984126984127,
"calib/gap": 0.2759591836734695,
"calib/mean_conf": 0.6991666666666666,
"calib/mu_c": 0.8141496598639457,
"calib/mu_w": 0.5381904761904762,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.19071428571428564,
"calib/std_conf": 0.4071997140751171,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.46905582922824296,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.1609019830743968,
"calib/step_q_w": 0.30815384615384617,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2471.0,
"completions/max_terminated_length": 2471.0,
"completions/mean_length": 429.30859375,
"completions/mean_terminated_length": 430.9921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.09371823817491531,
"learning_rate": 8.611111111111112e-07,
"loss": 0.0385,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.042047228664159775,
"mask/share_reasoning": 0.8362818956375122,
"mask/share_step_conf": 0.1177646592259407,
"num_tokens": 38439352.0,
"reward": 0.9178427457809448,
"reward_std": 0.15940846502780914,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6946808099746704,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8308483362197876,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5378715395927429,
"adv/mean_abs_reasoning": 0.3950071334838867,
"adv/mean_abs_step_conf": 0.743911623954773,
"adv/ratio_final_to_reasoning": 1.3616755091201005,
"adv/ratio_step_to_reasoning": 1.883286555849298,
"adv/std_final_conf": 0.7942361235618591,
"adv/std_reasoning": 0.6612808704376221,
"adv/std_step_conf": 0.9310983419418335,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7955873662092062,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.15759448818897626,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5590551181102362,
"calib/gap": 0.455518221813645,
"calib/mean_conf": 0.6541377952755906,
"calib/mu_c": 0.8280955414012738,
"calib/mu_w": 0.37257731958762885,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09681102362204712,
"calib/std_conf": 0.4139508709160388,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42593333333333333,
"calib/step_q_c_n": 795.0,
"calib/step_q_gap": 0.13033297491039425,
"calib/step_q_w": 0.2956003584229391,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 466.9609375,
"completions/mean_terminated_length": 466.9609375,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.07489202171564102,
"learning_rate": 8.333333333333333e-07,
"loss": -0.034,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03827427700161934,
"mask/share_reasoning": 0.8382338881492615,
"mask/share_step_conf": 0.1234918087720871,
"num_tokens": 38663046.0,
"reward": 0.9949482679367065,
"reward_std": 0.16509473323822021,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.79248046875,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8771034479141235,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.6157266497612,
"adv/mean_abs_reasoning": 0.4606185257434845,
"adv/mean_abs_step_conf": 0.7603050470352173,
"adv/ratio_final_to_reasoning": 1.3367387878447907,
"adv/ratio_step_to_reasoning": 1.6506176033801694,
"adv/std_final_conf": 0.8095325231552124,
"adv/std_reasoning": 0.7015524506568909,
"adv/std_step_conf": 0.9335606694221497,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.729532722179781,
"calib/avg_num_step_conf": 5.60546875,
"calib/ece": 0.27175298804780873,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5179282868525896,
"calib/gap": 0.3450993124522532,
"calib/mean_conf": 0.5895219123505976,
"calib/mu_c": 0.7710084033613441,
"calib/mu_w": 0.4259090909090909,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19358565737051786,
"calib/std_conf": 0.44340022045094263,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44571701720841295,
"calib/step_q_c_n": 523.0,
"calib/step_q_gap": 0.1940722803663077,
"calib/step_q_w": 0.25164473684210525,
"calib/step_q_w_n": 912.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2285.0,
"completions/max_terminated_length": 2285.0,
"completions/mean_length": 421.97265625,
"completions/mean_terminated_length": 428.670654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.1824,
"grad_norm": 0.07304099947214127,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0313,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.040931448340415955,
"mask/share_reasoning": 0.8156298398971558,
"mask/share_step_conf": 0.12781374156475067,
"num_tokens": 38877967.0,
"reward": 0.9164899587631226,
"reward_std": 0.20652560889720917,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.691113293170929,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.854366660118103,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.5187068581581116,
"adv/mean_abs_reasoning": 0.3769237995147705,
"adv/mean_abs_step_conf": 0.7414641976356506,
"adv/ratio_final_to_reasoning": 1.3761584140504373,
"adv/ratio_step_to_reasoning": 1.9671461409180528,
"adv/std_final_conf": 0.7765347361564636,
"adv/std_reasoning": 0.681504487991333,
"adv/std_step_conf": 0.9323990941047668,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7505080563216723,
"calib/avg_num_step_conf": 4.53515625,
"calib/ece": 0.18481927710843388,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.642570281124498,
"calib/gap": 0.34192771084337337,
"calib/mean_conf": 0.750120481927711,
"calib/mu_c": 0.8640963855421686,
"calib/mu_w": 0.5221686746987952,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13413654618473908,
"calib/std_conf": 0.3591751839887356,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4400571428571428,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.09751918190269604,
"calib/step_q_w": 0.3425379609544468,
"calib/step_q_w_n": 461.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2572.0,
"completions/max_terminated_length": 2572.0,
"completions/mean_length": 403.46875,
"completions/mean_terminated_length": 408.25299072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.07457685470581055,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04320889711380005,
"mask/share_reasoning": 0.8211055397987366,
"mask/share_step_conf": 0.12396682798862457,
"num_tokens": 39084607.0,
"reward": 0.9766821265220642,
"reward_std": 0.17918381094932556,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7681636810302734,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8617630004882812,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.5529680848121643,
"adv/mean_abs_reasoning": 0.365217924118042,
"adv/mean_abs_step_conf": 0.7145917415618896,
"adv/ratio_final_to_reasoning": 1.5140770709639093,
"adv/ratio_step_to_reasoning": 1.9566173902541724,
"adv/std_final_conf": 0.7967362999916077,
"adv/std_reasoning": 0.6402485966682434,
"adv/std_step_conf": 0.9333781599998474,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6749213836477987,
"calib/avg_num_step_conf": 4.94921875,
"calib/ece": 0.24749019607843145,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7019607843137254,
"calib/gap": 0.2984944968553457,
"calib/mean_conf": 0.7567450980392157,
"calib/mu_c": 0.8691194968553457,
"calib/mu_w": 0.570625,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19035294117647067,
"calib/std_conf": 0.3834241926321011,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4593935643564356,
"calib/step_q_c_n": 808.0,
"calib/step_q_gap": 0.05015609158955109,
"calib/step_q_w": 0.4092374727668845,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1285.0,
"completions/max_terminated_length": 1285.0,
"completions/mean_length": 432.22265625,
"completions/mean_terminated_length": 433.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.06947637349367142,
"learning_rate": 7.5e-07,
"loss": -0.0283,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04602811485528946,
"mask/share_reasoning": 0.820665717124939,
"mask/share_step_conf": 0.12939989566802979,
"num_tokens": 39298416.0,
"reward": 0.9457964897155762,
"reward_std": 0.18005359172821045,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.73384690284729,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8350897431373596,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.6815347671508789,
"adv/mean_abs_reasoning": 0.5925767421722412,
"adv/mean_abs_step_conf": 0.7791270613670349,
"adv/ratio_final_to_reasoning": 1.1501206825170684,
"adv/ratio_step_to_reasoning": 1.314812084103986,
"adv/std_final_conf": 0.859148383140564,
"adv/std_reasoning": 0.7929279804229736,
"adv/std_step_conf": 0.9338409304618835,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6755434782608695,
"calib/avg_num_step_conf": 5.37109375,
"calib/ece": 0.2848207171314741,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.44621513944223107,
"calib/gap": 0.29682608695652174,
"calib/mean_conf": 0.540996015936255,
"calib/mu_c": 0.7018260869565217,
"calib/mu_w": 0.40499999999999997,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18382470119521915,
"calib/std_conf": 0.44170193748471687,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.43251327433628317,
"calib/step_q_c_n": 565.0,
"calib/step_q_gap": 0.13611697803998685,
"calib/step_q_w": 0.2963962962962963,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 496.765625,
"completions/mean_terminated_length": 496.765625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.1856,
"grad_norm": 0.07883763313293457,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0019,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03562241047620773,
"mask/share_reasoning": 0.8468090891838074,
"mask/share_step_conf": 0.1175684779882431,
"num_tokens": 39529820.0,
"reward": 0.8818709850311279,
"reward_std": 0.20740194618701935,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6686819791793823,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8122473955154419,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6320722103118896,
"adv/mean_abs_reasoning": 0.3945220112800598,
"adv/mean_abs_step_conf": 0.779804527759552,
"adv/ratio_final_to_reasoning": 1.6021215350217806,
"adv/ratio_step_to_reasoning": 1.976580534073145,
"adv/std_final_conf": 0.8205243945121765,
"adv/std_reasoning": 0.6613585948944092,
"adv/std_step_conf": 0.9341087937355042,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7647095959595961,
"calib/avg_num_step_conf": 4.8046875,
"calib/ece": 0.20841269841269844,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4603174603174603,
"calib/gap": 0.41150757575757585,
"calib/mean_conf": 0.5643650793650795,
"calib/mu_c": 0.7799166666666667,
"calib/mu_w": 0.36840909090909085,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1482936507936508,
"calib/std_conf": 0.4328752084401884,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.49258849557522116,
"calib/step_q_c_n": 452.0,
"calib/step_q_gap": 0.1995396523875605,
"calib/step_q_w": 0.29304884318766067,
"calib/step_q_w_n": 778.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2488.0,
"completions/max_terminated_length": 2488.0,
"completions/mean_length": 447.453125,
"completions/mean_terminated_length": 449.2078552246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.07463081926107407,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0688,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04283197969198227,
"mask/share_reasoning": 0.8326044082641602,
"mask/share_step_conf": 0.12065736949443817,
"num_tokens": 39750192.0,
"reward": 0.9311752319335938,
"reward_std": 0.19330263137817383,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7413758039474487,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8319121599197388,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6272318363189697,
"adv/mean_abs_reasoning": 0.4546269178390503,
"adv/mean_abs_step_conf": 0.751192569732666,
"adv/ratio_final_to_reasoning": 1.3796627777790889,
"adv/ratio_step_to_reasoning": 1.6523275245189235,
"adv/std_final_conf": 0.8240246772766113,
"adv/std_reasoning": 0.7014360427856445,
"adv/std_step_conf": 0.933167040348053,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6834124723013613,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.2514682539682539,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5198412698412699,
"calib/gap": 0.279994301994302,
"calib/mean_conf": 0.6372619047619049,
"calib/mu_c": 0.7672592592592593,
"calib/mu_w": 0.48726495726495733,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.17650793650793647,
"calib/std_conf": 0.40794593243081617,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4296898079763663,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.09962611370885033,
"calib/step_q_w": 0.330063694267516,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1705.0,
"completions/max_terminated_length": 1705.0,
"completions/mean_length": 416.734375,
"completions/mean_terminated_length": 423.3492431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.09122262895107269,
"learning_rate": 6.666666666666667e-07,
"loss": -0.1328,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.044397637248039246,
"mask/share_reasoning": 0.8054189085960388,
"mask/share_step_conf": 0.13455848395824432,
"num_tokens": 39960940.0,
"reward": 0.9191524982452393,
"reward_std": 0.187923401594162,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6974878311157227,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8408170938491821,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.6960560083389282,
"adv/mean_abs_reasoning": 0.40270107984542847,
"adv/mean_abs_step_conf": 0.7754931449890137,
"adv/ratio_final_to_reasoning": 1.7284681943393352,
"adv/ratio_step_to_reasoning": 1.925728993045354,
"adv/std_final_conf": 0.8730778694152832,
"adv/std_reasoning": 0.6815537810325623,
"adv/std_step_conf": 0.9336084127426147,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7267478746351985,
"calib/avg_num_step_conf": 5.48828125,
"calib/ece": 0.2518142292490118,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4901185770750988,
"calib/gap": 0.3365333079558433,
"calib/mean_conf": 0.591505928853755,
"calib/mu_c": 0.739154929577465,
"calib/mu_w": 0.4026216216216217,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1410276679841897,
"calib/std_conf": 0.42191286233484737,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42364583333333333,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.15325156321055028,
"calib/step_q_w": 0.27039427012278305,
"calib/step_q_w_n": 733.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1732.0,
"completions/max_terminated_length": 1732.0,
"completions/mean_length": 454.7734375,
"completions/mean_terminated_length": 458.3543395996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1888,
"grad_norm": 0.08699323982000351,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0555,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04031483829021454,
"mask/share_reasoning": 0.8218288421630859,
"mask/share_step_conf": 0.13004378974437714,
"num_tokens": 40181194.0,
"reward": 0.9515196084976196,
"reward_std": 0.18108904361724854,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7318902015686035,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8625553846359253,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6604294180870056,
"adv/mean_abs_reasoning": 0.5337682962417603,
"adv/mean_abs_step_conf": 0.7507667541503906,
"adv/ratio_final_to_reasoning": 1.2372960753515354,
"adv/ratio_step_to_reasoning": 1.4065405522143357,
"adv/std_final_conf": 0.8525503277778625,
"adv/std_reasoning": 0.7754141688346863,
"adv/std_step_conf": 0.9337341785430908,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8208227040816326,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.15789682539682534,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.503968253968254,
"calib/gap": 0.48300000000000004,
"calib/mean_conf": 0.6051190476190477,
"calib/mu_c": 0.8197857142857143,
"calib/mu_w": 0.3367857142857143,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10373015873015867,
"calib/std_conf": 0.4244150803527591,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42572815533980585,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.157339997445069,
"calib/step_q_w": 0.26838815789473686,
"calib/step_q_w_n": 608.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1531.0,
"completions/max_terminated_length": 1531.0,
"completions/mean_length": 414.16015625,
"completions/mean_terminated_length": 417.4212646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.08844917267560959,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0594,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04261164367198944,
"mask/share_reasoning": 0.821635365486145,
"mask/share_step_conf": 0.12794049084186554,
"num_tokens": 40393291.0,
"reward": 0.9865151643753052,
"reward_std": 0.1949872523546219,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7963793277740479,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8704010844230652,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.642683207988739,
"adv/mean_abs_reasoning": 0.5560048222541809,
"adv/mean_abs_step_conf": 0.7505338191986084,
"adv/ratio_final_to_reasoning": 1.1558950251243192,
"adv/ratio_step_to_reasoning": 1.3498692622048831,
"adv/std_final_conf": 0.8355764150619507,
"adv/std_reasoning": 0.7754384279251099,
"adv/std_step_conf": 0.9327178001403809,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7686349206349206,
"calib/avg_num_step_conf": 4.78125,
"calib/ece": 0.21637450199203184,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5059760956175299,
"calib/gap": 0.42960698412698395,
"calib/mean_conf": 0.5848207171314742,
"calib/mu_c": 0.8004799999999999,
"calib/mu_w": 0.3708730158730159,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15159362549800792,
"calib/std_conf": 0.4404573272187454,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.451613475177305,
"calib/step_q_c_n": 564.0,
"calib/step_q_gap": 0.1341286266924565,
"calib/step_q_w": 0.3174848484848485,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2786.0,
"completions/max_terminated_length": 2786.0,
"completions/mean_length": 450.55859375,
"completions/mean_terminated_length": 452.3255310058594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.06448730826377869,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0308,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03963598608970642,
"mask/share_reasoning": 0.8372147083282471,
"mask/share_step_conf": 0.11924304068088531,
"num_tokens": 40614898.0,
"reward": 0.9538394212722778,
"reward_std": 0.18762339651584625,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7483574151992798,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8655713796615601,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.6583120226860046,
"adv/mean_abs_reasoning": 0.451434850692749,
"adv/mean_abs_step_conf": 0.7848700284957886,
"adv/ratio_final_to_reasoning": 1.4582658420717682,
"adv/ratio_step_to_reasoning": 1.7386119553937116,
"adv/std_final_conf": 0.8509746789932251,
"adv/std_reasoning": 0.7206116318702698,
"adv/std_step_conf": 0.9331481456756592,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7350760955129887,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.2431075697211155,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4940239043824701,
"calib/gap": 0.3431251639989503,
"calib/mean_conf": 0.5827091633466135,
"calib/mu_c": 0.7235135135135136,
"calib/mu_w": 0.38038834951456324,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1180876494023904,
"calib/std_conf": 0.4357882347351325,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4254787234042553,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.07888059256313379,
"calib/step_q_w": 0.3465981308411215,
"calib/step_q_w_n": 535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1827.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 459.5859375,
"completions/mean_terminated_length": 465.03558349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.192,
"grad_norm": 0.09744112193584442,
"learning_rate": 5.555555555555555e-07,
"loss": -0.1025,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037247009575366974,
"mask/share_reasoning": 0.8301904201507568,
"mask/share_step_conf": 0.1208437904715538,
"num_tokens": 40836408.0,
"reward": 0.9185820817947388,
"reward_std": 0.1872258186340332,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7002886533737183,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8290631175041199,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6360911130905151,
"adv/mean_abs_reasoning": 0.3917732238769531,
"adv/mean_abs_step_conf": 0.756089448928833,
"adv/ratio_final_to_reasoning": 1.6236206925930614,
"adv/ratio_step_to_reasoning": 1.9299160913720412,
"adv/std_final_conf": 0.8453565835952759,
"adv/std_reasoning": 0.7012593150138855,
"adv/std_step_conf": 0.9292190670967102,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7270427286356822,
"calib/avg_num_step_conf": 4.4453125,
"calib/ece": 0.2603543307086615,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4448818897637795,
"calib/gap": 0.3485044977511243,
"calib/mean_conf": 0.5526377952755905,
"calib/mu_c": 0.7419827586206895,
"calib/mu_w": 0.3934782608695652,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17814960629921267,
"calib/std_conf": 0.43033557989948107,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48066921606118546,
"calib/step_q_c_n": 523.0,
"calib/step_q_gap": 0.12810011037012858,
"calib/step_q_w": 0.35256910569105687,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1371.0,
"completions/max_terminated_length": 1371.0,
"completions/mean_length": 388.63671875,
"completions/mean_terminated_length": 391.69683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.0824630856513977,
"learning_rate": 5.277777777777779e-07,
"loss": -0.087,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0428725928068161,
"mask/share_reasoning": 0.8240259289741516,
"mask/share_step_conf": 0.12528899312019348,
"num_tokens": 41042163.0,
"reward": 0.9327545166015625,
"reward_std": 0.16425344347953796,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.7208136320114136,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8564140796661377,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.587517499923706,
"adv/mean_abs_reasoning": 0.4299156665802002,
"adv/mean_abs_step_conf": 0.7061957716941833,
"adv/ratio_final_to_reasoning": 1.3665877882449893,
"adv/ratio_step_to_reasoning": 1.6426379092245607,
"adv/std_final_conf": 0.8210249543190002,
"adv/std_reasoning": 0.7205567359924316,
"adv/std_step_conf": 0.9328953623771667,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7631914180355348,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.20908730158730154,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5634920634920635,
"calib/gap": 0.3618618840093866,
"calib/mean_conf": 0.6721825396825396,
"calib/mu_c": 0.8085987261146498,
"calib/mu_w": 0.44673684210526315,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1291269841269841,
"calib/std_conf": 0.3994117678913053,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4477542932628798,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.14672990301897737,
"calib/step_q_w": 0.30102439024390243,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2968.0,
"completions/max_terminated_length": 2968.0,
"completions/mean_length": 453.52734375,
"completions/mean_terminated_length": 455.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.08231616765260696,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0436,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.039243005216121674,
"mask/share_reasoning": 0.827210009098053,
"mask/share_step_conf": 0.12964074313640594,
"num_tokens": 41264426.0,
"reward": 0.9717744588851929,
"reward_std": 0.17918717861175537,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7575777173042297,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8672209978103638,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.5640467405319214,
"adv/mean_abs_reasoning": 0.4868428111076355,
"adv/mean_abs_step_conf": 0.7534921169281006,
"adv/ratio_final_to_reasoning": 1.1585808143056198,
"adv/ratio_step_to_reasoning": 1.5477112935359991,
"adv/std_final_conf": 0.8089180588722229,
"adv/std_reasoning": 0.7206279039382935,
"adv/std_step_conf": 0.9286974668502808,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7397829457364341,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.23244094488188966,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.468503937007874,
"calib/gap": 0.3943975193798452,
"calib/mean_conf": 0.545984251968504,
"calib/mu_c": 0.7400775193798451,
"calib/mu_w": 0.34567999999999993,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13527559055118102,
"calib/std_conf": 0.44471128357199513,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42944636678200687,
"calib/step_q_c_n": 578.0,
"calib/step_q_gap": 0.12737302404575862,
"calib/step_q_w": 0.30207334273624825,
"calib/step_q_w_n": 709.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2018.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 472.8125,
"completions/mean_terminated_length": 474.66668701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.1952,
"grad_norm": 0.07106750458478928,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0104,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03746788948774338,
"mask/share_reasoning": 0.8460267186164856,
"mask/share_step_conf": 0.11259913444519043,
"num_tokens": 41492146.0,
"reward": 0.9547048807144165,
"reward_std": 0.15090668201446533,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.74214768409729,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8680433034896851,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.574894905090332,
"adv/mean_abs_reasoning": 0.480303019285202,
"adv/mean_abs_step_conf": 0.7650856375694275,
"adv/ratio_final_to_reasoning": 1.1969421011466965,
"adv/ratio_step_to_reasoning": 1.592922815076294,
"adv/std_final_conf": 0.7902071475982666,
"adv/std_reasoning": 0.7392838001251221,
"adv/std_step_conf": 0.9323039650917053,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.778040810882902,
"calib/avg_num_step_conf": 4.8515625,
"calib/ece": 0.18841176470588228,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6862745098039216,
"calib/gap": 0.4496585756201654,
"calib/mean_conf": 0.7302549019607845,
"calib/mu_c": 0.8924846625766872,
"calib/mu_w": 0.4428260869565218,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13972549019607838,
"calib/std_conf": 0.4014098117706824,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46454735376044565,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.15134124689021666,
"calib/step_q_w": 0.313206106870229,
"calib/step_q_w_n": 524.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1090.0,
"completions/max_terminated_length": 1090.0,
"completions/mean_length": 394.96484375,
"completions/mean_terminated_length": 396.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.07112251967191696,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0243,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04334566369652748,
"mask/share_reasoning": 0.8269565105438232,
"mask/share_step_conf": 0.12579162418842316,
"num_tokens": 41698537.0,
"reward": 1.0030958652496338,
"reward_std": 0.1829470843076706,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.8042088747024536,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8754204511642456,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.5484204888343811,
"adv/mean_abs_reasoning": 0.4035857021808624,
"adv/mean_abs_step_conf": 0.7365462183952332,
"adv/ratio_final_to_reasoning": 1.3588699645970426,
"adv/ratio_step_to_reasoning": 1.8250057284367278,
"adv/std_final_conf": 0.7685003876686096,
"adv/std_reasoning": 0.6613385081291199,
"adv/std_step_conf": 0.9303016662597656,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7063632781717888,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.24650602409638556,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6024096385542169,
"calib/gap": 0.32946611505122125,
"calib/mean_conf": 0.6636947791164659,
"calib/mu_c": 0.806595744680851,
"calib/mu_w": 0.4771296296296298,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1719678714859438,
"calib/std_conf": 0.4215471637407645,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42699507389162555,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.17407892871014874,
"calib/step_q_w": 0.2529161451814768,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 444.671875,
"completions/mean_terminated_length": 449.9446716308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.10078582167625427,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0798,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.041414953768253326,
"mask/share_reasoning": 0.8224426507949829,
"mask/share_step_conf": 0.12442367523908615,
"num_tokens": 41919293.0,
"reward": 0.9268284440040588,
"reward_std": 0.15181460976600647,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7091015577316284,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8398678302764893,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.5862541794776917,
"adv/mean_abs_reasoning": 0.446882039308548,
"adv/mean_abs_step_conf": 0.7424435615539551,
"adv/ratio_final_to_reasoning": 1.3118767994900657,
"adv/ratio_step_to_reasoning": 1.6613859950664471,
"adv/std_final_conf": 0.8304627537727356,
"adv/std_reasoning": 0.7391940951347351,
"adv/std_step_conf": 0.9280949831008911,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7186588921282798,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.21861111111111103,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.3610667903525047,
"calib/mean_conf": 0.6678968253968254,
"calib/mu_c": 0.8083116883116884,
"calib/mu_w": 0.4472448979591837,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13769841269841263,
"calib/std_conf": 0.41985912847966006,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4870789865871833,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.1807310242047382,
"calib/step_q_w": 0.3063479623824451,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 442.6875,
"completions/mean_terminated_length": 446.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1984,
"grad_norm": 0.06984265148639679,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04010544717311859,
"mask/share_reasoning": 0.8295929431915283,
"mask/share_step_conf": 0.1224890649318695,
"num_tokens": 42137661.0,
"reward": 0.9584858417510986,
"reward_std": 0.1953909993171692,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7387632727622986,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8618021011352539,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.6203031539916992,
"adv/mean_abs_reasoning": 0.4454570710659027,
"adv/mean_abs_step_conf": 0.7624503374099731,
"adv/ratio_final_to_reasoning": 1.3925093893052807,
"adv/ratio_step_to_reasoning": 1.7116135020273888,
"adv/std_final_conf": 0.8146321773529053,
"adv/std_reasoning": 0.7013767957687378,
"adv/std_step_conf": 0.9314128160476685,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6024398395721924,
"calib/avg_num_step_conf": 6.375,
"calib/ece": 0.32418699186991873,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5609756097560976,
"calib/gap": 0.16650935828877012,
"calib/mean_conf": 0.6522357723577236,
"calib/mu_c": 0.7266911764705883,
"calib/mu_w": 0.5601818181818182,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21178861788617886,
"calib/std_conf": 0.4154881071410142,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42377253814147015,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.15810843276276543,
"calib/step_q_w": 0.2656641053787047,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 511.98046875,
"completions/mean_terminated_length": 516.0117797851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.054615847766399384,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0214,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03792574256658554,
"mask/share_reasoning": 0.8272412419319153,
"mask/share_step_conf": 0.12702052295207977,
"num_tokens": 42370272.0,
"reward": 0.8841220140457153,
"reward_std": 0.18903331458568573,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6271160244941711,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8426903486251831,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.5518736839294434,
"adv/mean_abs_reasoning": 0.45593172311782837,
"adv/mean_abs_step_conf": 0.729184627532959,
"adv/ratio_final_to_reasoning": 1.2104305446340269,
"adv/ratio_step_to_reasoning": 1.5993285629403609,
"adv/std_final_conf": 0.7792059779167175,
"adv/std_reasoning": 0.7393056750297546,
"adv/std_step_conf": 0.9334509372711182,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.744102598179281,
"calib/avg_num_step_conf": 5.1015625,
"calib/ece": 0.23091999999999985,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.604,
"calib/gap": 0.3446494783706558,
"calib/mean_conf": 0.6945200000000001,
"calib/mu_c": 0.833758389261745,
"calib/mu_w": 0.48910891089108915,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.16471999999999987,
"calib/std_conf": 0.39786828172147626,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4740087463556851,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.16036358506536247,
"calib/step_q_w": 0.3136451612903226,
"calib/step_q_w_n": 620.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2874.0,
"completions/max_terminated_length": 2874.0,
"completions/mean_length": 467.0078125,
"completions/mean_terminated_length": 474.420654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.05497977137565613,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0824,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.039705291390419006,
"mask/share_reasoning": 0.8246287107467651,
"mask/share_step_conf": 0.12004102021455765,
"num_tokens": 42593898.0,
"reward": 0.9408511519432068,
"reward_std": 0.20251992344856262,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7240562438964844,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.849052369594574,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6120193004608154,
"adv/mean_abs_reasoning": 0.39446377754211426,
"adv/mean_abs_step_conf": 0.7473738193511963,
"adv/ratio_final_to_reasoning": 1.5515221810080502,
"adv/ratio_step_to_reasoning": 1.8946576641537236,
"adv/std_final_conf": 0.8193371891975403,
"adv/std_reasoning": 0.7012815475463867,
"adv/std_step_conf": 0.933303713798523,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7354870129870129,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.2661417322834645,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.484251968503937,
"calib/gap": 0.3562532467532468,
"calib/mean_conf": 0.5514960629921261,
"calib/mu_c": 0.6917532467532468,
"calib/mu_w": 0.33549999999999996,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10566929133858265,
"calib/std_conf": 0.4516201433965361,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.43661951909476654,
"calib/step_q_c_n": 707.0,
"calib/step_q_gap": 0.14031884341909084,
"calib/step_q_w": 0.2963006756756757,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 440.23046875,
"completions/mean_terminated_length": 441.9568786621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.2016,
"grad_norm": 0.124534972012043,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0073,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03976032882928848,
"mask/share_reasoning": 0.8296130895614624,
"mask/share_step_conf": 0.1267203837633133,
"num_tokens": 42814365.0,
"reward": 0.9378255605697632,
"reward_std": 0.1679186224937439,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7129054665565491,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8463394045829773,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6316561698913574,
"adv/mean_abs_reasoning": 0.40052786469459534,
"adv/mean_abs_step_conf": 0.7750684022903442,
"adv/ratio_final_to_reasoning": 1.577059239993199,
"adv/ratio_step_to_reasoning": 1.9351173054622257,
"adv/std_final_conf": 0.8263714909553528,
"adv/std_reasoning": 0.661289632320404,
"adv/std_step_conf": 0.9340594410896301,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7126622348844571,
"calib/avg_num_step_conf": 5.3828125,
"calib/ece": 0.2652777777777779,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5119047619047619,
"calib/gap": 0.34605698005698016,
"calib/mean_conf": 0.579404761904762,
"calib/mu_c": 0.7400740740740741,
"calib/mu_w": 0.39401709401709395,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1544841269841271,
"calib/std_conf": 0.4439979386226176,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4244771723122238,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.11141565586014945,
"calib/step_q_w": 0.31306151645207436,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 508.71875,
"completions/mean_terminated_length": 512.7244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.06018049269914627,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0009,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03598202019929886,
"mask/share_reasoning": 0.8418542146682739,
"mask/share_step_conf": 0.1143513023853302,
"num_tokens": 43050205.0,
"reward": 0.9317029118537903,
"reward_std": 0.18911895155906677,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7130597829818726,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8480023145675659,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6036766767501831,
"adv/mean_abs_reasoning": 0.40422987937927246,
"adv/mean_abs_step_conf": 0.7332354784011841,
"adv/ratio_final_to_reasoning": 1.4933994431019728,
"adv/ratio_step_to_reasoning": 1.8139071746183786,
"adv/std_final_conf": 0.817194402217865,
"adv/std_reasoning": 0.7013508677482605,
"adv/std_step_conf": 0.9334839582443237,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6981868628210092,
"calib/avg_num_step_conf": 5.68359375,
"calib/ece": 0.25333333333333324,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5783132530120482,
"calib/gap": 0.33146535036778946,
"calib/mean_conf": 0.67285140562249,
"calib/mu_c": 0.8365873015873017,
"calib/mu_w": 0.5051219512195122,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21008032128514048,
"calib/std_conf": 0.4111599592084048,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45365625,
"calib/step_q_c_n": 640.0,
"calib/step_q_gap": 0.13104275306748464,
"calib/step_q_w": 0.32261349693251534,
"calib/step_q_w_n": 815.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2296.0,
"completions/max_terminated_length": 2296.0,
"completions/mean_length": 458.2265625,
"completions/mean_terminated_length": 461.83465576171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.0659196600317955,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0602,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04230055958032608,
"mask/share_reasoning": 0.8142014741897583,
"mask/share_step_conf": 0.1356854885816574,
"num_tokens": 43271679.0,
"reward": 0.9137133955955505,
"reward_std": 0.2043464332818985,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.699204683303833,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8352533578872681,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6116088628768921,
"adv/mean_abs_reasoning": 0.4545787572860718,
"adv/mean_abs_step_conf": 0.7597032785415649,
"adv/ratio_final_to_reasoning": 1.345440923214983,
"adv/ratio_step_to_reasoning": 1.6712247687884692,
"adv/std_final_conf": 0.8217902183532715,
"adv/std_reasoning": 0.7206497192382812,
"adv/std_step_conf": 0.9321048259735107,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7860244769048559,
"calib/avg_num_step_conf": 4.578125,
"calib/ece": 0.18760956175298799,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5816733067729084,
"calib/gap": 0.4638748519542044,
"calib/mean_conf": 0.6509561752988047,
"calib/mu_c": 0.8394630872483221,
"calib/mu_w": 0.37558823529411767,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12247011952191228,
"calib/std_conf": 0.4221108524837698,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.48903125000000003,
"calib/step_q_c_n": 640.0,
"calib/step_q_gap": 0.22457260338345864,
"calib/step_q_w": 0.2644586466165414,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2359.0,
"completions/max_terminated_length": 2359.0,
"completions/mean_length": 453.33984375,
"completions/mean_terminated_length": 456.9094543457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.2048,
"grad_norm": 0.0771113857626915,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0687,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04301437363028526,
"mask/share_reasoning": 0.830362856388092,
"mask/share_step_conf": 0.1188102513551712,
"num_tokens": 43492710.0,
"reward": 0.9810526371002197,
"reward_std": 0.19653618335723877,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7776480913162231,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.873519778251648,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.696414053440094,
"adv/mean_abs_reasoning": 0.503356397151947,
"adv/mean_abs_step_conf": 0.7308834195137024,
"adv/ratio_final_to_reasoning": 1.3835406828650458,
"adv/ratio_step_to_reasoning": 1.4520197292596886,
"adv/std_final_conf": 0.8705217242240906,
"adv/std_reasoning": 0.7575937509536743,
"adv/std_step_conf": 0.9286946058273315,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7478233034571063,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.21253968253968253,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.44841269841269843,
"calib/gap": 0.3764302176696542,
"calib/mean_conf": 0.5742063492063492,
"calib/mu_c": 0.7385211267605634,
"calib/mu_w": 0.3620909090909092,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11162698412698414,
"calib/std_conf": 0.420615296038167,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.41900151285930404,
"calib/step_q_c_n": 661.0,
"calib/step_q_gap": 0.12649426648249246,
"calib/step_q_w": 0.2925072463768116,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2465.0,
"completions/max_terminated_length": 2465.0,
"completions/mean_length": 477.4609375,
"completions/mean_terminated_length": 481.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.07757709175348282,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.1058,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037302643060684204,
"mask/share_reasoning": 0.8361595273017883,
"mask/share_step_conf": 0.11872531473636627,
"num_tokens": 43720652.0,
"reward": 0.9659968018531799,
"reward_std": 0.18238919973373413,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7463644742965698,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8785977363586426,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.5742350816726685,
"adv/mean_abs_reasoning": 0.44685792922973633,
"adv/mean_abs_step_conf": 0.7691583633422852,
"adv/ratio_final_to_reasoning": 1.285050670718759,
"adv/ratio_step_to_reasoning": 1.7212592930108874,
"adv/std_final_conf": 0.8032878637313843,
"adv/std_reasoning": 0.7014285922050476,
"adv/std_step_conf": 0.9336322546005249,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8076305994095752,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.17900197628458492,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5533596837944664,
"calib/gap": 0.4853922795533309,
"calib/mean_conf": 0.6321442687747035,
"calib/mu_c": 0.8355102040816328,
"calib/mu_w": 0.35011792452830187,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11505928853754935,
"calib/std_conf": 0.4271501896552184,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48788321167883203,
"calib/step_q_c_n": 685.0,
"calib/step_q_gap": 0.18552184384149378,
"calib/step_q_w": 0.30236136783733825,
"calib/step_q_w_n": 541.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1532.0,
"completions/max_terminated_length": 1532.0,
"completions/mean_length": 402.9296875,
"completions/mean_terminated_length": 406.10235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.12211701273918152,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04216302931308746,
"mask/share_reasoning": 0.8294909596443176,
"mask/share_step_conf": 0.12053349614143372,
"num_tokens": 43929746.0,
"reward": 0.9791326522827148,
"reward_std": 0.15515466034412384,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7964698672294617,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8500766754150391,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6161271333694458,
"adv/mean_abs_reasoning": 0.40150701999664307,
"adv/mean_abs_step_conf": 0.7621166110038757,
"adv/ratio_final_to_reasoning": 1.534536390857119,
"adv/ratio_step_to_reasoning": 1.8981401894548386,
"adv/std_final_conf": 0.8294500708580017,
"adv/std_reasoning": 0.6612933874130249,
"adv/std_step_conf": 0.9313527345657349,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7397683648903638,
"calib/avg_num_step_conf": 4.71875,
"calib/ece": 0.2267599999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.58,
"calib/gap": 0.37528986921725543,
"calib/mean_conf": 0.6614800000000001,
"calib/mu_c": 0.8251063829787233,
"calib/mu_w": 0.44981651376146786,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1621199999999999,
"calib/std_conf": 0.4196358059079325,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4617519379844961,
"calib/step_q_c_n": 645.0,
"calib/step_q_gap": 0.07622795929888326,
"calib/step_q_w": 0.38552397868561283,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 410.81640625,
"completions/mean_terminated_length": 415.6877746582031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.208,
"grad_norm": 0.07284358888864517,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0873,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.04061251878738403,
"mask/share_reasoning": 0.826056718826294,
"mask/share_step_conf": 0.12161204218864441,
"num_tokens": 44140899.0,
"reward": 0.9444481730461121,
"reward_std": 0.17379814386367798,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7354198694229126,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8480076193809509,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.4589390754699707,
"adv/mean_abs_reasoning": 0.2612670063972473,
"adv/mean_abs_step_conf": 0.7599738836288452,
"adv/ratio_final_to_reasoning": 1.7565902476494486,
"adv/ratio_step_to_reasoning": 2.9088015900229345,
"adv/std_final_conf": 0.7345681190490723,
"adv/std_reasoning": 0.5724842548370361,
"adv/std_step_conf": 0.9323859214782715,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7516464906228439,
"calib/avg_num_step_conf": 4.28125,
"calib/ece": 0.1903085937500001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.6328125,
"calib/gap": 0.39288044909991854,
"calib/mean_conf": 0.7239023437500001,
"calib/mu_c": 0.8881140939597316,
"calib/mu_w": 0.4952336448598131,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1660898437500001,
"calib/std_conf": 0.38376090456103895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5202241086587436,
"calib/step_q_c_n": 589.0,
"calib/step_q_gap": 0.14073692917156416,
"calib/step_q_w": 0.3794871794871794,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1233.0,
"completions/max_terminated_length": 1233.0,
"completions/mean_length": 344.3125,
"completions/mean_terminated_length": 345.66278076171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.07948347926139832,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0368,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04886038973927498,
"mask/share_reasoning": 0.8164427280426025,
"mask/share_step_conf": 0.1307906210422516,
"num_tokens": 44331587.0,
"reward": 0.9816206693649292,
"reward_std": 0.1148589700460434,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7804819941520691,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8663530349731445,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.5788639783859253,
"adv/mean_abs_reasoning": 0.43158918619155884,
"adv/mean_abs_step_conf": 0.7658077478408813,
"adv/ratio_final_to_reasoning": 1.3412383741445255,
"adv/ratio_step_to_reasoning": 1.7743904906388948,
"adv/std_final_conf": 0.7939836382865906,
"adv/std_reasoning": 0.6817267537117004,
"adv/std_step_conf": 0.9311909675598145,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7302751695357329,
"calib/avg_num_step_conf": 4.7578125,
"calib/ece": 0.2826399999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.568,
"calib/gap": 0.35685054773082925,
"calib/mean_conf": 0.65592,
"calib/mu_c": 0.858611111111111,
"calib/mu_w": 0.5017605633802817,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25327999999999995,
"calib/std_conf": 0.4114221112191225,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5152714932126696,
"calib/step_q_c_n": 442.0,
"calib/step_q_gap": 0.15917613238792738,
"calib/step_q_w": 0.35609536082474225,
"calib/step_q_w_n": 776.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2490.0,
"completions/max_terminated_length": 2490.0,
"completions/mean_length": 424.859375,
"completions/mean_terminated_length": 431.60321044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.06191324070096016,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0622,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.040418848395347595,
"mask/share_reasoning": 0.8236619234085083,
"mask/share_step_conf": 0.1202942505478859,
"num_tokens": 44545407.0,
"reward": 0.912652850151062,
"reward_std": 0.18009766936302185,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.6936922073364258,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8519259691238403,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.552986741065979,
"adv/mean_abs_reasoning": 0.3963276445865631,
"adv/mean_abs_step_conf": 0.7611110806465149,
"adv/ratio_final_to_reasoning": 1.3952767328225057,
"adv/ratio_step_to_reasoning": 1.9204087603843096,
"adv/std_final_conf": 0.802998960018158,
"adv/std_reasoning": 0.6814876198768616,
"adv/std_step_conf": 0.9322589039802551,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7270659354356226,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.23898039215686262,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4666666666666667,
"calib/gap": 0.3391617848685031,
"calib/mean_conf": 0.5709019607843139,
"calib/mu_c": 0.7211971830985916,
"calib/mu_w": 0.3820353982300885,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12650980392156852,
"calib/std_conf": 0.42496732307605706,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45695652173913043,
"calib/step_q_c_n": 690.0,
"calib/step_q_gap": 0.10902057867863224,
"calib/step_q_w": 0.3479359430604982,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2413.0,
"completions/max_terminated_length": 2413.0,
"completions/mean_length": 410.4453125,
"completions/mean_terminated_length": 410.4453125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.2112,
"grad_norm": 0.078434057533741,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0446,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04607347771525383,
"mask/share_reasoning": 0.8190134763717651,
"mask/share_step_conf": 0.13491299748420715,
"num_tokens": 44755865.0,
"reward": 0.9583539962768555,
"reward_std": 0.1432131975889206,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7369366884231567,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8696149587631226,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.661697506904602,
"adv/mean_abs_reasoning": 0.5621737837791443,
"adv/mean_abs_step_conf": 0.7594678997993469,
"adv/ratio_final_to_reasoning": 1.177033732267666,
"adv/ratio_step_to_reasoning": 1.350948624274005,
"adv/std_final_conf": 0.8686993718147278,
"adv/std_reasoning": 0.8265514969825745,
"adv/std_step_conf": 0.9334787130355835,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7286388140161726,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.23304878048780484,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5487804878048781,
"calib/gap": 0.34104851752021564,
"calib/mean_conf": 0.6541869918699187,
"calib/mu_c": 0.8011428571428573,
"calib/mu_w": 0.4600943396226416,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15906504065040647,
"calib/std_conf": 0.4130591179876554,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.437112970711297,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.1297291725705268,
"calib/step_q_w": 0.3073837981407702,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2308.0,
"completions/max_terminated_length": 2308.0,
"completions/mean_length": 504.63671875,
"completions/mean_terminated_length": 514.6892700195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.09063680469989777,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0419,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03921530395746231,
"mask/share_reasoning": 0.8160558342933655,
"mask/share_step_conf": 0.1251976191997528,
"num_tokens": 44989252.0,
"reward": 0.9256468415260315,
"reward_std": 0.23798325657844543,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7116104960441589,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8396830558776855,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5623447895050049,
"adv/mean_abs_reasoning": 0.45310142636299133,
"adv/mean_abs_step_conf": 0.7232848405838013,
"adv/ratio_final_to_reasoning": 1.2411013446126207,
"adv/ratio_step_to_reasoning": 1.5962978673220043,
"adv/std_final_conf": 0.7876421809196472,
"adv/std_reasoning": 0.7206158638000488,
"adv/std_step_conf": 0.9335463047027588,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8116764514024788,
"calib/avg_num_step_conf": 4.26953125,
"calib/ece": 0.16904382470119506,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5896414342629482,
"calib/gap": 0.5388421395955645,
"calib/mean_conf": 0.634382470119522,
"calib/mu_c": 0.8597945205479454,
"calib/mu_w": 0.32095238095238093,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11087649402390425,
"calib/std_conf": 0.4465395021981601,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.507291280148423,
"calib/step_q_c_n": 539.0,
"calib/step_q_gap": 0.22765229097874784,
"calib/step_q_w": 0.2796389891696751,
"calib/step_q_w_n": 554.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2677.0,
"completions/max_terminated_length": 2677.0,
"completions/mean_length": 466.46875,
"completions/mean_terminated_length": 470.1417236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.06912390142679214,
"learning_rate": 0.0,
"loss": -0.0252,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04253852739930153,
"mask/share_reasoning": 0.842212975025177,
"mask/share_step_conf": 0.10743597149848938,
"num_tokens": 45216716.0,
"reward": 0.9851292371749878,
"reward_std": 0.18738599121570587,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7934476733207703,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8682170510292053,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.022248380884993823,
"train_runtime": 12721.3125,
"train_samples_per_second": 4.025,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 45216716,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}